From 9fed74d8147b18c6ecdf7ff248f2e151d2f90277 Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Sun, 26 Jul 2015 19:20:31 -0700 Subject: [PATCH 1/3] Add period.pyx to package --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 30c5d1052d9b3..9b21860a01078 100755 --- a/setup.py +++ b/setup.py @@ -269,6 +269,7 @@ class CheckSDist(sdist_class): 'pandas/index.pyx', 'pandas/algos.pyx', 'pandas/parser.pyx', + 'pandas/src/period.pyx', 'pandas/src/sparse.pyx', 'pandas/src/testing.pyx'] From 703f418b9b07a9c7dda622708851bb30de452773 Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Sun, 26 Jul 2015 19:20:49 -0700 Subject: [PATCH 2/3] Fixes for vb_suite --- vb_suite/binary_ops.py | 6 ++-- vb_suite/frame_ctor.py | 33 +++++++++++++++--- vb_suite/frame_methods.py | 8 ++--- vb_suite/gil.py | 4 +-- vb_suite/groupby.py | 12 +++---- vb_suite/io_bench.py | 1 + vb_suite/join_merge.py | 32 +++++++++--------- vb_suite/packers.py | 2 +- vb_suite/pandas_vb_common.py | 2 ++ vb_suite/reindex.py | 28 +++++++++++----- vb_suite/sparse.py | 4 +-- vb_suite/timeseries.py | 65 +++++++++++++++++++----------------- 12 files changed, 120 insertions(+), 77 deletions(-) diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index db9a6b730064e..cd8d1ad93b6e1 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -88,7 +88,7 @@ Benchmark("df // 0", setup, name='frame_float_floor_by_zero') setup = common_setup + """ -df = DataFrame(np.random.random_integers((1000, 1000))) +df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) """ frame_int_div_by_zero = \ Benchmark("df / 0", setup, name='frame_int_div_by_zero') @@ -111,8 +111,8 @@ Benchmark("df / df2", setup, name='frame_float_mod') setup = common_setup + """ -df = DataFrame(np.random.random_integers((1000, 1000))) -df2 = DataFrame(np.random.random_integers((1000, 1000))) +df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) +df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) """ frame_int_mod = \ Benchmark("df / df2", setup, name='frame_int_mod') diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py index b11dd6c290ae1..8ad63fc556c2e 100644 --- a/vb_suite/frame_ctor.py +++ b/vb_suite/frame_ctor.py @@ -50,9 +50,30 @@ # offset times 1000 can easily go out of Timestamp bounds and raise errors. 
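# For context, a standalone sketch of the bounds guard that the generated
# setup strings below embed (it assumes only pandas is importable and mirrors
# the get_period_count/get_index_for_offset pair used throughout this patch):
# the period count is capped so start_date + n * off stays below Timestamp.max.

from pandas import Timestamp, date_range

def get_period_count(start_date, off):
    # Days spanned by ten offsets; 0 means a sub-daily offset,
    # for which 1000 periods are always in bounds.
    ten_offsets_in_days = ((start_date + off * 10) - start_date).days
    if ten_offsets_in_days == 0:
        return 1000
    # Otherwise cap at ~90% of the offsets that fit before Timestamp.max
    # (the 9/10 factor leaves headroom), and never more than 1000.
    return min(9 * ((Timestamp.max - start_date).days // ten_offsets_in_days),
               1000)

def get_index_for_offset(off):
    start_date = Timestamp('1/1/1900')
    return date_range(start_date,
                      periods=min(1000, get_period_count(start_date, off)),
                      freq=off)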
dynamic_benchmarks = {} n_steps = [1, 2] +offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, + 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, + 'FY5253': {'startingMonth': 1, 'weekday': 1}, + 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} + +offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, + 'FY5253Quarter': {'variation': ['nearest', 'last']}} + for offset in offsets.__all__: for n in n_steps: - setup = common_setup + """ + kwargs = {} + if offset in offset_kwargs: + kwargs = offset_kwargs[offset] + + if offset in offset_extra_cases: + extras = offset_extra_cases[offset] + else: + extras = {'': ['']} + + for extra_arg in extras: + for extra in extras[extra_arg]: + if extra: + kwargs[extra_arg] = extra + setup = common_setup + """ def get_period_count(start_date, off): ten_offsets_in_days = ((start_date + off * 10) - start_date).days @@ -69,12 +90,14 @@ def get_index_for_offset(off): periods=min(1000, get_period_count(start_date, off)), freq=off) -idx = get_index_for_offset({}({})) +idx = get_index_for_offset({}({}, **{})) df = DataFrame(np.random.randn(len(idx),10), index=idx) d = dict([ (col,df[col]) for col in df.columns ]) -""".format(offset, n) - key = 'frame_ctor_dtindex_{}x{}'.format(offset, n) - dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key) +""".format(offset, n, kwargs) + key = 'frame_ctor_dtindex_{}x{}'.format(offset, n) + if extra: + key += '__{}_{}'.format(extra_arg, extra) + dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key) # Have to stuff them in globals() so vbench detects them globals().update(dynamic_benchmarks) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 1d7c5e0d9acef..ce5109efe8f6d 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -418,8 +418,8 @@ def f(K=100): #---------------------------------------------------------------------- # equals setup = common_setup + """ -def make_pair(name): - df = globals()[name] +def make_pair(frame): + df = frame df2 = df.copy() df2.ix[-1,-1] = np.nan return df, df2 @@ -437,8 +437,8 @@ def test_unequal(name): nonunique_cols = object_df.copy() nonunique_cols.columns = ['A']*len(nonunique_cols.columns) -pairs = dict([(name,make_pair(name)) - for name in ('float_df', 'object_df', 'nonunique_cols')]) +pairs = dict([(name, make_pair(frame)) + for name, frame in (('float_df', float_df), ('object_df', object_df), ('nonunique_cols', nonunique_cols))]) """ frame_float_equal = Benchmark('test_equal("float_df")', setup) frame_object_equal = Benchmark('test_equal("object_df")', setup) diff --git a/vb_suite/gil.py b/vb_suite/gil.py index 30f41bb3c738d..d5aec7c3e2917 100644 --- a/vb_suite/gil.py +++ b/vb_suite/gil.py @@ -94,5 +94,5 @@ def take_1d_pg2_float64(): """ -nogil_take1d_float64 = Benchmark('take_1d_pg2()_int64', setup, start_date=datetime(2015, 1, 1)) -nogil_take1d_int64 = Benchmark('take_1d_pg2()_float64', setup, start_date=datetime(2015, 1, 1)) +nogil_take1d_float64 = Benchmark('take_1d_pg2_int64()', setup, start_date=datetime(2015, 1, 1)) +nogil_take1d_int64 = Benchmark('take_1d_pg2_float64()', setup, start_date=datetime(2015, 1, 1)) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 73f5f19d6a626..6795b315fc517 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -212,7 +212,7 @@ def f(): 'value3' : np.random.randn(100000)}) """ -stmt = "df.pivot_table(rows='key1', cols=['key2', 'key3'])" +stmt = "df.pivot_table(index='key1', columns=['key2', 'key3'])" 
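# The rows=/cols= keywords were renamed to index=/columns= in
# DataFrame.pivot_table in later pandas releases, which is what the change
# above tracks. A minimal standalone sketch of the new spelling (the column
# names here are illustrative, not taken from the benchmark data):

import numpy as np
from pandas import DataFrame

pivot_demo = DataFrame({'key1': ['a', 'b'] * 50,
                        'key2': ['x', 'y'] * 50,
                        'value1': np.random.randn(100)})
# Equivalent to the removed pivot_demo.pivot_table(rows='key1', cols='key2'):
pivot_demo.pivot_table(index='key1', columns='key2', values='value1')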
groupby_pivot_table = Benchmark(stmt, setup, start_date=datetime(2011, 12, 15)) @@ -243,13 +243,13 @@ def f(): """ groupby_first_float64 = Benchmark('data.groupby(labels).first()', setup, - start_date=datetime(2012, 5, 1)) + start_date=datetime(2012, 5, 1)) groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup, start_date=datetime(2013, 1, 1)) groupby_last_float64 = Benchmark('data.groupby(labels).last()', setup, - start_date=datetime(2012, 5, 1)) + start_date=datetime(2012, 5, 1)) groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup, start_date=datetime(2013, 1, 1)) @@ -259,7 +259,7 @@ def f(): groupby_nth_float32_none = Benchmark('data2.groupby(labels).nth(0)', setup, start_date=datetime(2013, 1, 1)) groupby_nth_float64_any = Benchmark('data.groupby(labels).nth(0,dropna="all")', setup, - start_date=datetime(2012, 5, 1)) + start_date=datetime(2012, 5, 1)) groupby_nth_float32_any = Benchmark('data2.groupby(labels).nth(0,dropna="all")', setup, start_date=datetime(2013, 1, 1)) @@ -269,9 +269,9 @@ def f(): """ groupby_first_datetimes = Benchmark('df.groupby("b").first()', setup, - start_date=datetime(2013, 5, 1)) + start_date=datetime(2013, 5, 1)) groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup, - start_date=datetime(2013, 5, 1)) + start_date=datetime(2013, 5, 1)) groupby_nth_datetimes_none = Benchmark('df.groupby("b").nth(0)', setup, start_date=datetime(2013, 5, 1)) groupby_nth_datetimes_any = Benchmark('df.groupby("b").nth(0,dropna="all")', setup, diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index a70c543ca59eb..483d61387898d 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -2,6 +2,7 @@ from datetime import datetime common_setup = """from pandas_vb_common import * +from StringIO import StringIO """ #---------------------------------------------------------------------- diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py index 02132acb71a33..244c6abe71b05 100644 --- a/vb_suite/join_merge.py +++ b/vb_suite/join_merge.py @@ -31,15 +31,15 @@ except: pass -df = DataFrame({'data1' : np.random.randn(100000), +df = pd.DataFrame({'data1' : np.random.randn(100000), 'data2' : np.random.randn(100000), 'key1' : key1, 'key2' : key2}) -df_key1 = DataFrame(np.random.randn(len(level1), 4), index=level1, +df_key1 = pd.DataFrame(np.random.randn(len(level1), 4), index=level1, columns=['A', 'B', 'C', 'D']) -df_key2 = DataFrame(np.random.randn(len(level2), 4), index=level2, +df_key2 = pd.DataFrame(np.random.randn(len(level2), 4), index=level2, columns=['A', 'B', 'C', 'D']) df_shuf = df.reindex(df.index[shuf]) @@ -69,10 +69,10 @@ #---------------------------------------------------------------------- # Joins on integer keys setup = common_setup + """ -df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), +df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000)}) -df2 = DataFrame({'key1': np.arange(500), 'value2': randn(500)}) +df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500)}) df3 = df[:5000] """ @@ -96,9 +96,9 @@ key = np.tile(indices[:8000], 10) key2 = np.tile(indices2[:8000], 10) -left = DataFrame({'key' : key, 'key2':key2, +left = pd.DataFrame({'key' : key, 'key2':key2, 'value' : np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2':indices2[2000:], +right = pd.DataFrame({'key': indices[2000:], 'key2':indices2[2000:], 'value2' : np.random.randn(8000)}) """ @@ -112,7 +112,7 @@ # 
Appending DataFrames setup = common_setup + """ -df1 = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) +df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) df2 = df1.copy() df2.index = np.arange(10000, 20000) mdf1 = df1.copy() @@ -180,7 +180,7 @@ def sample(values, k): start_date=datetime(2012, 2, 27)) setup = common_setup + """ -df = DataFrame(randn(5, 4)) +df = pd.DataFrame(randn(5, 4)) """ concat_small_frames = Benchmark('concat([df] * 1000)', setup, @@ -191,8 +191,8 @@ def sample(values, k): # Concat empty setup = common_setup + """ -df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) -empty = DataFrame() +df = pd.DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) +empty = pd.DataFrame() """ concat_empty_frames1 = Benchmark('concat([df,empty])', setup, @@ -207,11 +207,11 @@ def sample(values, k): setup = common_setup + """ groups = tm.makeStringIndex(10).values -left = DataFrame({'group': groups.repeat(5000), +left = pd.DataFrame({'group': groups.repeat(5000), 'key' : np.tile(np.arange(0, 10000, 2), 10), 'lvalue': np.random.randn(50000)}) -right = DataFrame({'key' : np.arange(10000), +right = pd.DataFrame({'key' : np.arange(10000), 'rvalue' : np.random.randn(10000)}) """ @@ -242,10 +242,10 @@ def sample(values, k): np.random.seed(2718281) n = 50000 -left = DataFrame(np.random.randint(1, n/500, (n, 2)), +left = pd.DataFrame(np.random.randint(1, n/500, (n, 2)), columns=['jim', 'joe']) -right = DataFrame(np.random.randint(1, n/500, (n, 2)), +right = pd.DataFrame(np.random.randint(1, n/500, (n, 2)), columns=['jolie', 'jolia']).set_index('jolie') ''' @@ -255,7 +255,7 @@ def sample(values, k): setup = common_setup + """ low, high, n = -1 << 10, 1 << 10, 1 << 20 -left = DataFrame(np.random.randint(low, high, (n, 7)), +left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) left['left'] = left.sum(axis=1) diff --git a/vb_suite/packers.py b/vb_suite/packers.py index 62e0e8fc33b58..60738a62bd287 100644 --- a/vb_suite/packers.py +++ b/vb_suite/packers.py @@ -92,7 +92,7 @@ def remove(f): # hdf table setup = common_setup + """ -df2.to_hdf(f,'df',table=True) +df2.to_hdf(f,'df',format='table') """ packers_read_hdf_table = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date) diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py index a599301bb53fe..128e262d45d66 100644 --- a/vb_suite/pandas_vb_common.py +++ b/vb_suite/pandas_vb_common.py @@ -1,4 +1,5 @@ from pandas import * +import pandas as pd from datetime import timedelta from numpy.random import randn from numpy.random import randint @@ -7,6 +8,7 @@ import random import numpy as np +np.random.seed(1234) try: import pandas._tseries as lib except: diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py index 156382f1fb13a..07f0e0f7e1bff 100644 --- a/vb_suite/reindex.py +++ b/vb_suite/reindex.py @@ -49,6 +49,18 @@ #---------------------------------------------------------------------- # Pad / backfill +def pad(source_series, target_index): + try: + source_series.reindex(target_index, method='pad') + except: + source_series.reindex(target_index, fillMethod='pad') + +def backfill(source_series, target_index): + try: + source_series.reindex(target_index, method='backfill') + except: + source_series.reindex(target_index, fillMethod='backfill') + setup = common_setup + """ rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) @@ -57,23 +69,23 @@ ts3 = 
ts2.reindex(ts.index) ts4 = ts3.astype('float32') -def pad(): +def pad(source_series, target_index): try: - ts2.reindex(ts.index, method='pad') + source_series.reindex(target_index, method='pad') except: - ts2.reindex(ts.index, fillMethod='pad') -def backfill(): + source_series.reindex(target_index, fillMethod='pad') +def backfill(source_series, target_index): try: - ts2.reindex(ts.index, method='backfill') + source_series.reindex(target_index, method='backfill') except: - ts2.reindex(ts.index, fillMethod='backfill') + source_series.reindex(target_index, fillMethod='backfill') """ -statement = "pad()" +statement = "pad(ts2, ts.index)" reindex_daterange_pad = Benchmark(statement, setup, name="reindex_daterange_pad") -statement = "backfill()" +statement = "backfill(ts2, ts.index)" reindex_daterange_backfill = Benchmark(statement, setup, name="reindex_daterange_backfill") diff --git a/vb_suite/sparse.py b/vb_suite/sparse.py index e591b197d3384..5da06451fe2d1 100644 --- a/vb_suite/sparse.py +++ b/vb_suite/sparse.py @@ -40,7 +40,7 @@ setup = common_setup + """ -s = pd.Series([nan] * 10000) +s = pd.Series([np.nan] * 10000) s[0] = 3.0 s[100] = -1.0 s[999] = 12.1 @@ -59,7 +59,7 @@ A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) """ -stmt = "ss = pandas.sparse.series.from_coo(A)" +stmt = "ss = pandas.sparse.series.SparseSeries.from_coo(A)" sparse_series_from_coo = Benchmark(stmt, setup, name="sparse_series_from_coo", start_date=datetime(2015, 1, 3)) diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 75147e079bb65..7f5433980271b 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -1,16 +1,21 @@ from vbench.api import Benchmark from datetime import datetime +from pandas import * -common_setup = """from pandas_vb_common import * -from datetime import timedelta N = 100000 - try: - rng = date_range('1/1/2000', periods=N, freq='min') + rng = date_range(start='1/1/2000', periods=N, freq='min') except NameError: - rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute()) + rng = DatetimeIndex(start='1/1/2000', periods=N, freq='T') def date_range(start=None, end=None, periods=None, freq=None): - return DatetimeIndex(start, end, periods=periods, offset=freq) + return DatetimeIndex(start=start, end=end, periods=periods, offset=freq) + + +common_setup = """from pandas_vb_common import * +from datetime import timedelta +N = 100000 + +rng = date_range(start='1/1/2000', periods=N, freq='T') if hasattr(Series, 'convert'): Series.resample = Series.convert @@ -22,7 +27,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # Lookup value in large time series, hash map population setup = common_setup + """ -rng = date_range('1/1/2000', periods=1500000, freq='s') +rng = date_range(start='1/1/2000', periods=1500000, freq='S') ts = Series(1, index=rng) """ @@ -69,7 +74,7 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ N = 100000 -rng = date_range('1/1/2000', periods=N, freq='s') +rng = date_range(start='1/1/2000', periods=N, freq='s') rng = rng.take(np.random.permutation(N)) ts = Series(np.random.randn(N), index=rng) """ @@ -81,7 +86,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # Shifting, add offset setup = common_setup + """ -rng = date_range('1/1/2000', periods=10000, freq='T') +rng = date_range(start='1/1/2000', periods=10000, freq='T') """ datetimeindex_add_offset = Benchmark('rng + timedelta(minutes=2)', setup, @@ -89,9 +94,9 @@ def 
date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ N = 10000 -rng = date_range('1/1/1990', periods=N, freq='53s') +rng = date_range(start='1/1/1990', periods=N, freq='53s') ts = Series(np.random.randn(N), index=rng) -dates = date_range('1/1/1990', periods=N * 10, freq='5s') +dates = date_range(start='1/1/1990', periods=N * 10, freq='5s') """ timeseries_asof_single = Benchmark('ts.asof(dates[0])', setup, start_date=datetime(2012, 4, 27)) @@ -108,7 +113,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # Time zone stuff setup = common_setup + """ -rng = date_range('1/1/2000', '3/1/2000', tz='US/Eastern') +rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') """ timeseries_timestamp_tzinfo_cons = \ @@ -118,7 +123,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # Resampling period setup = common_setup + """ -rng = period_range('1/1/2000', '1/1/2001', freq='T') +rng = period_range(start='1/1/2000', end='1/1/2001', freq='T') ts = Series(np.random.randn(len(rng)), index=rng) """ @@ -127,7 +132,7 @@ def date_range(start=None, end=None, periods=None, freq=None): start_date=datetime(2012, 4, 25)) setup = common_setup + """ -rng = date_range('1/1/2000', '1/1/2001', freq='T') +rng = date_range(start='1/1/2000', end='1/1/2001', freq='T') ts = Series(np.random.randn(len(rng)), index=rng) """ @@ -149,7 +154,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # to_datetime setup = common_setup + """ -rng = date_range('1/1/2000', periods=20000, freq='h') +rng = date_range(start='1/1/2000', periods=20000, freq='H') strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in rng] """ @@ -162,7 +167,7 @@ def date_range(start=None, end=None, periods=None, freq=None): start_date=datetime(2012, 7, 11)) setup = common_setup + """ -rng = date_range('1/1/2000', periods=10000, freq='D') +rng = date_range(start='1/1/2000', periods=10000, freq='D') strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str) """ @@ -183,7 +188,7 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ from pandas.tseries.frequencies import infer_freq -rng = date_range('1/1/1700', freq='D', periods=100000) +rng = date_range(start='1/1/1700', freq='D', periods=100000) a = rng[:50000].append(rng[50002:]) """ @@ -193,7 +198,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # setitem PeriodIndex setup = common_setup + """ -rng = period_range('1/1/1990', freq='S', periods=20000) +rng = period_range(start='1/1/1990', freq='S', periods=20000) df = DataFrame(index=range(len(rng))) """ @@ -202,7 +207,7 @@ def date_range(start=None, end=None, periods=None, freq=None): start_date=datetime(2012, 8, 1)) setup = common_setup + """ -rng = date_range('1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') +rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') """ datetimeindex_normalize = \ @@ -211,7 +216,7 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ from pandas.tseries.offsets import Second -s1 = date_range('1/1/2000', periods=100, freq='S') +s1 = date_range(start='1/1/2000', periods=100, freq='S') curr = s1[-1] slst = [] for i in range(100): @@ -224,7 +229,7 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000, freq='H') +rng = date_range(start='1/1/2000', periods=1000, freq='H') df = DataFrame(np.random.randn(len(rng), 
2), rng) """ @@ -232,7 +237,7 @@ def date_range(start=None, end=None, periods=None, freq=None): Benchmark('df.reset_index()', setup, start_date=datetime(2012, 9, 1)) setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000, freq='H', +rng = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') df = DataFrame(np.random.randn(len(rng), 2), index=rng) """ @@ -241,7 +246,7 @@ def date_range(start=None, end=None, periods=None, freq=None): Benchmark('df.reset_index()', setup, start_date=datetime(2012, 9, 1)) setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000, freq='T') +rng = date_range(start='1/1/2000', periods=1000, freq='T') index = rng.repeat(10) """ @@ -251,13 +256,13 @@ def date_range(start=None, end=None, periods=None, freq=None): # tz_localize with infer argument. This is an attempt to emulate the results # of read_csv with duplicated data. Not passing infer_dst will fail setup = common_setup + """ -dst_rng = date_range('10/29/2000 1:00:00', - '10/29/2000 1:59:59', freq='S') -index = date_range('10/29/2000', '10/29/2000 00:59:59', freq='S') +dst_rng = date_range(start='10/29/2000 1:00:00', + end='10/29/2000 1:59:59', freq='S') +index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') index = index.append(dst_rng) index = index.append(dst_rng) -index = index.append(date_range('10/29/2000 2:00:00', - '10/29/2000 3:00:00', freq='S')) +index = index.append(date_range(start='10/29/2000 2:00:00', + end='10/29/2000 3:00:00', freq='S')) """ datetimeindex_infer_dst = \ @@ -269,7 +274,7 @@ def date_range(start=None, end=None, periods=None, freq=None): # Resampling: fast-path various functions setup = common_setup + """ -rng = date_range('20130101',periods=100000,freq='50L') +rng = date_range(start='20130101',periods=100000,freq='50L') df = DataFrame(np.random.randn(100000,2),index=rng) """ @@ -376,7 +381,7 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ N = 10000 -rng = date_range('1/1/1', periods=N, freq='B') +rng = date_range(start='1/1/1', periods=N, freq='B') """ timeseries_is_month_start = Benchmark('rng.is_month_start', setup, From 201760e4bb9ecbf5d479c210ae270c558235b089 Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Sun, 26 Jul 2015 19:21:46 -0700 Subject: [PATCH 3/3] PERF: add initial asv config and vbench->asv conversion script --- asv_bench/asv.conf.json | 64 + asv_bench/benchmarks/__init__.py | 0 asv_bench/benchmarks/attrs_caching.py | 23 + asv_bench/benchmarks/binary_ops.py | 236 +++ asv_bench/benchmarks/categoricals.py | 11 + asv_bench/benchmarks/ctors.py | 52 + asv_bench/benchmarks/eval.py | 239 +++ asv_bench/benchmarks/frame_ctor.py | 1706 ++++++++++++++++++++++ asv_bench/benchmarks/frame_methods.py | 936 ++++++++++++ asv_bench/benchmarks/gil.py | 267 ++++ asv_bench/benchmarks/groupby.py | 1683 +++++++++++++++++++++ asv_bench/benchmarks/hdfstore_bench.py | 351 +++++ asv_bench/benchmarks/index_object.py | 292 ++++ asv_bench/benchmarks/indexing.py | 458 ++++++ asv_bench/benchmarks/inference.py | 138 ++ asv_bench/benchmarks/io_bench.py | 135 ++ asv_bench/benchmarks/io_sql.py | 215 +++ asv_bench/benchmarks/join_merge.py | 359 +++++ asv_bench/benchmarks/miscellaneous.py | 30 + asv_bench/benchmarks/packers.py | 857 +++++++++++ asv_bench/benchmarks/pandas_vb_common.py | 1 + asv_bench/benchmarks/panel_ctor.py | 64 + asv_bench/benchmarks/panel_methods.py | 56 + asv_bench/benchmarks/parser_vb.py | 109 ++ asv_bench/benchmarks/plotting.py | 19 + asv_bench/benchmarks/reindex.py | 
384 +++++ asv_bench/benchmarks/replace.py | 48 + asv_bench/benchmarks/reshape.py | 76 + asv_bench/benchmarks/series_methods.py | 74 + asv_bench/benchmarks/sparse.py | 55 + asv_bench/benchmarks/stat_ops.py | 236 +++ asv_bench/benchmarks/strings.py | 393 +++++ asv_bench/benchmarks/timedelta.py | 34 + asv_bench/benchmarks/timeseries.py | 1046 +++++++++++++ asv_bench/vbench_to_asv.py | 151 ++ 35 files changed, 10798 insertions(+) create mode 100644 asv_bench/asv.conf.json create mode 100644 asv_bench/benchmarks/__init__.py create mode 100644 asv_bench/benchmarks/attrs_caching.py create mode 100644 asv_bench/benchmarks/binary_ops.py create mode 100644 asv_bench/benchmarks/categoricals.py create mode 100644 asv_bench/benchmarks/ctors.py create mode 100644 asv_bench/benchmarks/eval.py create mode 100644 asv_bench/benchmarks/frame_ctor.py create mode 100644 asv_bench/benchmarks/frame_methods.py create mode 100644 asv_bench/benchmarks/gil.py create mode 100644 asv_bench/benchmarks/groupby.py create mode 100644 asv_bench/benchmarks/hdfstore_bench.py create mode 100644 asv_bench/benchmarks/index_object.py create mode 100644 asv_bench/benchmarks/indexing.py create mode 100644 asv_bench/benchmarks/inference.py create mode 100644 asv_bench/benchmarks/io_bench.py create mode 100644 asv_bench/benchmarks/io_sql.py create mode 100644 asv_bench/benchmarks/join_merge.py create mode 100644 asv_bench/benchmarks/miscellaneous.py create mode 100644 asv_bench/benchmarks/packers.py create mode 120000 asv_bench/benchmarks/pandas_vb_common.py create mode 100644 asv_bench/benchmarks/panel_ctor.py create mode 100644 asv_bench/benchmarks/panel_methods.py create mode 100644 asv_bench/benchmarks/parser_vb.py create mode 100644 asv_bench/benchmarks/plotting.py create mode 100644 asv_bench/benchmarks/reindex.py create mode 100644 asv_bench/benchmarks/replace.py create mode 100644 asv_bench/benchmarks/reshape.py create mode 100644 asv_bench/benchmarks/series_methods.py create mode 100644 asv_bench/benchmarks/sparse.py create mode 100644 asv_bench/benchmarks/stat_ops.py create mode 100644 asv_bench/benchmarks/strings.py create mode 100644 asv_bench/benchmarks/timedelta.py create mode 100644 asv_bench/benchmarks/timeseries.py create mode 100644 asv_bench/vbench_to_asv.py diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json new file mode 100644 index 0000000000000..ddb6d97de43b5 --- /dev/null +++ b/asv_bench/asv.conf.json @@ -0,0 +1,64 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "pandas", + + // The project's homepage + "project_url": "http://pandas.pydata.org/", + + // The URL of the source code repository for the project being + // benchmarked + "repo": "..", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/pydata/pandas/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + "pythons": ["2.7", "3.4"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. 
An empty + // list indicates to just test against the default (latest) + // version. + "matrix": { + // To run against multiple versions, replace with + // "numpy": ["1.7", "1.9"], + "numpy": [], + "Cython": [], + "matplotlib": [], + "sqlalchemy": [], + "scipy": [], + "pytables": [], + }, + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + // "env_dir": "env", + + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + // "results_dir": "results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + // "html_dir": "html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8 +} diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py new file mode 100644 index 0000000000000..ecb91923dc663 --- /dev/null +++ b/asv_bench/benchmarks/attrs_caching.py @@ -0,0 +1,23 @@ +from pandas_vb_common import * + + +class getattr_dataframe_index(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(10, 6)) + self.cur_index = self.df.index + + def time_getattr_dataframe_index(self): + self.foo = self.df.index + + +class setattr_dataframe_index(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(10, 6)) + self.cur_index = self.df.index + + def time_setattr_dataframe_index(self): + self.df.index = self.cur_index \ No newline at end of file diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py new file mode 100644 index 0000000000000..13976014ec6f1 --- /dev/null +++ b/asv_bench/benchmarks/binary_ops.py @@ -0,0 +1,236 @@ +from pandas_vb_common import * +import pandas.computation.expressions as expr + + +class frame_add(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + + def time_frame_add(self): + (self.df + self.df2) + + +class frame_add_no_ne(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + expr.set_use_numexpr(False) + + def time_frame_add_no_ne(self): + (self.df + self.df2) + + def teardown(self): + expr.set_use_numexpr(True) + + +class frame_add_st(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_frame_add_st(self): + (self.df + self.df2) + + def teardown(self): + expr.set_numexpr_threads() + + +class frame_float_div(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 1000)) + self.df2 = DataFrame(np.random.randn(1000, 1000)) + + def time_frame_float_div(self): + (self.df // self.df2) + + +class frame_float_div_by_zero(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 1000)) + + def time_frame_float_div_by_zero(self): + (self.df / 0) + + +class 
frame_float_floor_by_zero(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 1000)) + + def time_frame_float_floor_by_zero(self): + (self.df // 0) + + +class frame_float_mod(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 1000)) + self.df2 = DataFrame(np.random.randn(1000, 1000)) + + def time_frame_float_mod(self): + (self.df / self.df2) + + +class frame_int_div_by_zero(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + + def time_frame_int_div_by_zero(self): + (self.df / 0) + + +class frame_int_mod(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + self.df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + + def time_frame_int_mod(self): + (self.df / self.df2) + + +class frame_mult(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + + def time_frame_mult(self): + (self.df * self.df2) + + +class frame_mult_no_ne(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + expr.set_use_numexpr(False) + + def time_frame_mult_no_ne(self): + (self.df * self.df2) + + def teardown(self): + expr.set_use_numexpr(True) + + +class frame_mult_st(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_frame_mult_st(self): + (self.df * self.df2) + + def teardown(self): + expr.set_numexpr_threads() + + +class frame_multi_and(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + + def time_frame_multi_and(self): + self.df[((self.df > 0) & (self.df2 > 0))] + + +class frame_multi_and_no_ne(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + expr.set_use_numexpr(False) + + def time_frame_multi_and_no_ne(self): + self.df[((self.df > 0) & (self.df2 > 0))] + + def teardown(self): + expr.set_use_numexpr(True) + + +class frame_multi_and_st(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_frame_multi_and_st(self): + self.df[((self.df > 0) & (self.df2 > 0))] + + def teardown(self): + expr.set_numexpr_threads() + + +class series_timestamp_compare(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.halfway = ((self.N // 2) - 1) + self.s = Series(date_range('20010101', periods=self.N, freq='T')) + self.ts = self.s[self.halfway] + + def time_series_timestamp_compare(self): + (self.s <= self.ts) + + +class timestamp_ops_diff1(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.s = Series(date_range('20010101', periods=self.N, freq='s')) + + def time_timestamp_ops_diff1(self): + self.s.diff() + + +class timestamp_ops_diff2(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.s = Series(date_range('20010101', periods=self.N, 
freq='s')) + + def time_timestamp_ops_diff2(self): + (self.s - self.s.shift()) + + +class timestamp_series_compare(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.halfway = ((self.N // 2) - 1) + self.s = Series(date_range('20010101', periods=self.N, freq='T')) + self.ts = self.s[self.halfway] + + def time_timestamp_series_compare(self): + (self.ts >= self.s) \ No newline at end of file diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py new file mode 100644 index 0000000000000..34caef221a340 --- /dev/null +++ b/asv_bench/benchmarks/categoricals.py @@ -0,0 +1,11 @@ +from pandas_vb_common import * + + +class concat_categorical(object): + goal_time = 0.2 + + def setup(self): + self.s = pd.Series((list('aabbcd') * 1000000)).astype('category') + + def time_concat_categorical(self): + concat([self.s, self.s]) \ No newline at end of file diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py new file mode 100644 index 0000000000000..b48211b3db83e --- /dev/null +++ b/asv_bench/benchmarks/ctors.py @@ -0,0 +1,52 @@ +from pandas_vb_common import * + + +class frame_constructor_ndarray(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randn(100, 100) + + def time_frame_constructor_ndarray(self): + DataFrame(self.arr) + + +class ctor_index_array_string(object): + goal_time = 0.2 + + def setup(self): + self.data = np.array(['foo', 'bar', 'baz'], dtype=object) + + def time_ctor_index_array_string(self): + Index(self.data) + + +class series_constructor_ndarray(object): + goal_time = 0.2 + + def setup(self): + self.data = np.random.randn(100) + self.index = Index(np.arange(100)) + + def time_series_constructor_ndarray(self): + Series(self.data, index=self.index) + + +class dtindex_from_series_ctor(object): + goal_time = 0.2 + + def setup(self): + self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) + + def time_dtindex_from_series_ctor(self): + DatetimeIndex(self.s) + + +class index_from_series_ctor(object): + goal_time = 0.2 + + def setup(self): + self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) + + def time_index_from_series_ctor(self): + Index(self.s) \ No newline at end of file diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py new file mode 100644 index 0000000000000..397312355aa47 --- /dev/null +++ b/asv_bench/benchmarks/eval.py @@ -0,0 +1,239 @@ +from pandas_vb_common import * +import pandas.computation.expressions as expr +import pandas as pd + + +class eval_frame_add_all_threads(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_add_all_threads(self): + pd.eval('df + df2 + df3 + df4') + + +class eval_frame_add_one_thread(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_eval_frame_add_one_thread(self): + pd.eval('df + df2 + df3 + df4') + + +class eval_frame_add_python(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + 
self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_add_python(self): + pd.eval('df + df2 + df3 + df4', engine='python') + + +class eval_frame_add_python_one_thread(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_eval_frame_add_python_one_thread(self): + pd.eval('df + df2 + df3 + df4', engine='python') + + +class eval_frame_and_all_threads(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_and_all_threads(self): + pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') + + +class eval_frame_and_python_one_thread(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_eval_frame_and_python_one_thread(self): + pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python') + + +class eval_frame_and_python(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_and_python(self): + pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python') + + +class eval_frame_chained_cmp_all_threads(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_chained_cmp_all_threads(self): + pd.eval('df < df2 < df3 < df4') + + +class eval_frame_chained_cmp_python_one_thread(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_eval_frame_chained_cmp_python_one_thread(self): + pd.eval('df < df2 < df3 < df4', engine='python') + + +class eval_frame_chained_cmp_python(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_chained_cmp_python(self): + pd.eval('df < df2 < df3 < df4', engine='python') + + +class eval_frame_mult_all_threads(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_mult_all_threads(self): + pd.eval('df * df2 * df3 * df4') + + +class eval_frame_mult_one_thread(object): + 
goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_eval_frame_mult_one_thread(self): + pd.eval('df * df2 * df3 * df4') + + +class eval_frame_mult_python(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + + def time_eval_frame_mult_python(self): + pd.eval('df * df2 * df3 * df4', engine='python') + + +class eval_frame_mult_python_one_thread(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(20000, 100)) + self.df2 = DataFrame(np.random.randn(20000, 100)) + self.df3 = DataFrame(np.random.randn(20000, 100)) + self.df4 = DataFrame(np.random.randn(20000, 100)) + expr.set_numexpr_threads(1) + + def time_eval_frame_mult_python_one_thread(self): + pd.eval('df * df2 * df3 * df4', engine='python') + + +class query_datetime_index(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.halfway = ((self.N // 2) - 1) + self.index = date_range('20010101', periods=self.N, freq='T') + self.s = Series(self.index) + self.ts = self.s.iloc[self.halfway] + self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) + + def time_query_datetime_index(self): + self.df.query('index < @ts') + + +class query_datetime_series(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.halfway = ((self.N // 2) - 1) + self.index = date_range('20010101', periods=self.N, freq='T') + self.s = Series(self.index) + self.ts = self.s.iloc[self.halfway] + self.df = DataFrame({'dates': self.s.values, }) + + def time_query_datetime_series(self): + self.df.query('dates < @ts') + + +class query_with_boolean_selection(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.halfway = ((self.N // 2) - 1) + self.index = date_range('20010101', periods=self.N, freq='T') + self.s = Series(self.index) + self.ts = self.s.iloc[self.halfway] + self.N = 1000000 + self.df = DataFrame({'a': np.random.randn(self.N), }) + self.min_val = self.df['a'].min() + self.max_val = self.df['a'].max() + + def time_query_with_boolean_selection(self): + self.df.query('(a >= @min_val) & (a <= @max_val)') \ No newline at end of file diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py new file mode 100644 index 0000000000000..2cb337e0e6b9d --- /dev/null +++ b/asv_bench/benchmarks/frame_ctor.py @@ -0,0 +1,1706 @@ +from pandas_vb_common import * +try: + from pandas.tseries.offsets import * +except: + from pandas.core.datetools import * + + +class frame_ctor_dtindex_BDayx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BDay(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, 
self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BDayx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BDayx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BDay(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BDayx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BMonthBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BMonthBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BMonthBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BMonthBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BMonthBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BMonthBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BMonthEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BMonthEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BMonthEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BMonthEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if 
(self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BMonthEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BMonthEndx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BQuarterBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BQuarterBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BQuarterBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BQuarterBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BQuarterBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BQuarterBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BQuarterEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BQuarterEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BQuarterEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BQuarterEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, 
get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BQuarterEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BQuarterEndx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BYearBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BYearBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BYearBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BYearBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BYearBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BYearBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BYearEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BYearEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BYearEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BYearEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BYearEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BYearEndx2(self): + DataFrame(self.d) + + +class 
frame_ctor_dtindex_BusinessDayx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BusinessDay(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BusinessDayx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BusinessDayx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BusinessDay(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BusinessDayx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BusinessHourx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BusinessHour(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BusinessHourx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_BusinessHourx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(BusinessHour(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_BusinessHourx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CBMonthBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - 
start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CBMonthBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CBMonthBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CBMonthBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CBMonthBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CBMonthBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CBMonthEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CBMonthEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CBMonthEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CBMonthEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CBMonthEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CBMonthEndx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CDayx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CDay(1, **{})) + self.df = 
DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CDayx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CDayx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CDay(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CDayx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CustomBusinessDayx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CustomBusinessDay(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CustomBusinessDayx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_CustomBusinessDayx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(CustomBusinessDay(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_CustomBusinessDayx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_DateOffsetx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(DateOffset(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_DateOffsetx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_DateOffsetx2(object): + goal_time = 0.2 + + def setup(self): + + def 
get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(DateOffset(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_DateOffsetx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Dayx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Day(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Dayx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Dayx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Day(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Dayx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Easterx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Easter(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Easterx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Easterx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, 
periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Easter(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Easterx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253Quarterx1__variation_last(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253Quarterx1__variation_last(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253Quarterx2__variation_last(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253Quarterx2__variation_last(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + 
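+            # NOTE: periods is capped both at 1000 and at what fits below
+            # Timestamp.max; get_period_count returns 1000 directly for
+            # offsets that advance less than a day per ten applications.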
self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253x1__variation_last(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253x1__variation_last(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253x1__variation_nearest(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253x1__variation_nearest(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253x2__variation_last(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253x2__variation_last(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_FY5253x2__variation_nearest(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 
1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_FY5253x2__variation_nearest(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Hourx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Hour(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Hourx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Hourx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Hour(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Hourx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_LastWeekOfMonthx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(LastWeekOfMonth(1, **{'week': 1, 'weekday': 1, })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_LastWeekOfMonthx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_LastWeekOfMonthx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = 
get_index_for_offset(LastWeekOfMonth(2, **{'week': 1, 'weekday': 1, })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_LastWeekOfMonthx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Microx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Micro(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Microx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Microx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Micro(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Microx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Millix1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Milli(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Millix1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Millix2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Milli(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Millix2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Minutex1(object): + goal_time = 0.2 + + def setup(self): + 
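+        # NOTE: every generated class follows this template: build a
+        # DatetimeIndex for the offset in the class name (here Minute(1)),
+        # wrap it in a 10-column random frame, and time rebuilding the
+        # DataFrame from a dict of that frame's columns.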
+ def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Minute(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Minutex1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Minutex2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Minute(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Minutex2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_MonthBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(MonthBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_MonthBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_MonthBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(MonthBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_MonthBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_MonthEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = 
Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(MonthEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_MonthEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_MonthEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(MonthEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_MonthEndx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Nanox1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Nano(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Nanox1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Nanox2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Nano(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Nanox2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_QuarterBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(QuarterBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def 
time_frame_ctor_dtindex_QuarterBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_QuarterBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(QuarterBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_QuarterBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_QuarterEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(QuarterEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_QuarterEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_QuarterEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(QuarterEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_QuarterEndx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Secondx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Second(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Secondx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Secondx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + 
else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Second(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Secondx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_WeekOfMonthx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(WeekOfMonth(1, **{'week': 1, 'weekday': 1, })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_WeekOfMonthx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_WeekOfMonthx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(WeekOfMonth(2, **{'week': 1, 'weekday': 1, })) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_WeekOfMonthx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Weekx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(Week(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Weekx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_Weekx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = 
get_index_for_offset(Week(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_Weekx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_YearBeginx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(YearBegin(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_YearBeginx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_YearBeginx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(YearBegin(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_YearBeginx2(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_YearEndx1(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(YearEnd(1, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_YearEndx1(self): + DataFrame(self.d) + + +class frame_ctor_dtindex_YearEndx2(object): + goal_time = 0.2 + + def setup(self): + + def get_period_count(start_date, off): + self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (self.ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) + + def get_index_for_offset(off): + self.start_date = Timestamp('1/1/1900') + return date_range(self.start_date, periods=min(1000, get_period_count(self.start_date, off)), freq=off) + self.idx = get_index_for_offset(YearEnd(2, **{})) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor_dtindex_YearEndx2(self): + DataFrame(self.d) + + +class frame_ctor_list_of_dict(object): + goal_time = 0.2 + + def setup(self): + (N, K) = (5000, 
50)
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+        self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
+        try:
+            self.data = self.frame.to_dict()
+        except:
+            self.data = self.frame.toDict()
+        self.some_dict = self.data.values()[0]
+        self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
+
+    def time_frame_ctor_list_of_dict(self):
+        DataFrame(self.dict_list)
+
+
+class frame_ctor_nested_dict(object):
+    goal_time = 0.2
+
+    def setup(self):
+        (N, K) = (5000, 50)
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+        self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
+        try:
+            self.data = self.frame.to_dict()
+        except:
+            self.data = self.frame.toDict()
+        self.some_dict = self.data.values()[0]
+        self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
+
+    def time_frame_ctor_nested_dict(self):
+        DataFrame(self.data)
+
+
+class frame_ctor_nested_dict_int64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = dict(((i, dict(((j, float(j)) for j in xrange(100)))) for i in xrange(2000)))
+
+    def time_frame_ctor_nested_dict_int64(self):
+        DataFrame(self.data)
+
+
+class frame_from_series(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)])
+        self.s = Series(randn(10000), index=self.mi)
+
+    def time_frame_from_series(self):
+        DataFrame(self.s)
+
+
+class frame_get_numeric_data(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 25))
+        self.df['foo'] = 'bar'
+        self.df['bar'] = 'baz'
+        self.df = self.df.consolidate()
+
+    def time_frame_get_numeric_data(self):
+        self.df._get_numeric_data()
+
+
+class series_ctor_from_dict(object):
+    goal_time = 0.2
+
+    def setup(self):
+        (N, K) = (5000, 50)
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+        self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
+        try:
+            self.data = self.frame.to_dict()
+        except:
+            self.data = self.frame.toDict()
+        self.some_dict = self.data.values()[0]
+        self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values]
+
+    def time_series_ctor_from_dict(self):
+        Series(self.some_dict)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
new file mode 100644
index 0000000000000..2bd51201b45ca
--- /dev/null
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -0,0 +1,936 @@
+from pandas_vb_common import *
+
+
+class frame_apply_axis_1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 100))
+
+    def time_frame_apply_axis_1(self):
+        self.df.apply((lambda x: (x + 1)), axis=1)
+
+
+class frame_apply_lambda_mean(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 100))
+
+    def time_frame_apply_lambda_mean(self):
+        self.df.apply((lambda x: x.sum()))
+
+
+class frame_apply_np_mean(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 100))
+
+    def time_frame_apply_np_mean(self):
+        self.df.apply(np.mean)
+
+
+class frame_apply_pass_thru(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 100))
+
+    def time_frame_apply_pass_thru(self):
+        self.df.apply((lambda x: x))
+
+
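+# NOTE: in the apply benchmarks, axis=1 variants invoke the function once
+# per row (1000 calls on these frames), while the default axis=0 variants
+# invoke it once per column (100 calls).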
+class frame_apply_ref_by_name(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+
+    def time_frame_apply_ref_by_name(self):
+        self.df.apply((lambda x: (x['A'] + x['B'])), axis=1)
+
+
+class frame_apply_user_func(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.arange(1028.0))
+        self.df = DataFrame({i: self.s for i in range(1028)})
+
+    def time_frame_apply_user_func(self):
+        self.df.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)]))
+
+
+class frame_assign_timeseries_index(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = date_range('1/1/2000', periods=100000, freq='D')
+        self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx)
+
+    def time_frame_assign_timeseries_index(self):
+        # work on a copy so repeated runs do not mutate the fixture
+        df = self.df.copy()
+        df['date'] = df.index
+
+
+class frame_boolean_row_select(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 100))
+        self.bool_arr = np.zeros(10000, dtype=bool)
+        self.bool_arr[:1000] = True
+
+    def time_frame_boolean_row_select(self):
+        self.df[self.bool_arr]
+
+
+class frame_count_level_axis0_mixed_dtypes_multi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+        self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+        self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+    def time_frame_count_level_axis0_mixed_dtypes_multi(self):
+        self.df.count(axis=0, level=1)
+
+
+class frame_count_level_axis0_multi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+        self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+    def time_frame_count_level_axis0_multi(self):
+        self.df.count(axis=0, level=1)
+
+
+class frame_count_level_axis1_mixed_dtypes_multi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+        self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+        self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+    def time_frame_count_level_axis1_mixed_dtypes_multi(self):
+        self.df.count(axis=1, level=1)
+
+
+class frame_count_level_axis1_multi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x))))
+        self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x))))
+
+    def time_frame_count_level_axis1_multi(self):
+        self.df.count(axis=1, level=1)
+
+
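+# NOTE: the eight dropna benchmarks below form a grid over axis (0 or 1),
+# how ('any' or 'all'), and homogeneous vs. mixed dtypes (the object
+# 'foo' column); all share the same NaN-seeding pattern in setup.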
+class frame_dropna_axis0_all(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+
+    def time_frame_dropna_axis0_all(self):
+        self.df.dropna(how='all', axis=0)
+
+
+class frame_dropna_axis0_all_mixed_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+
+    def time_frame_dropna_axis0_all_mixed_dtypes(self):
+        self.df.dropna(how='all', axis=0)
+
+
+class frame_dropna_axis0_any(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+
+    def time_frame_dropna_axis0_any(self):
+        self.df.dropna(how='any', axis=0)
+
+
+class frame_dropna_axis0_any_mixed_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+
+    def time_frame_dropna_axis0_any_mixed_dtypes(self):
+        self.df.dropna(how='any', axis=0)
+
+
+class frame_dropna_axis1_all(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+
+    def time_frame_dropna_axis1_all(self):
+        self.df.dropna(how='all', axis=1)
+
+
+class frame_dropna_axis1_all_mixed_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+
+    def time_frame_dropna_axis1_all_mixed_dtypes(self):
+        self.df.dropna(how='all', axis=1)
+
+
+class frame_dropna_axis1_any(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+
+    def time_frame_dropna_axis1_any(self):
+        self.df.dropna(how='any', axis=1)
+
+
+class frame_dropna_axis1_any_mixed_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+        self.df.ix[50:1000, 20:50] = np.nan
+        self.df.ix[2000:3000] = np.nan
+        self.df.ix[:, 60:70] = np.nan
+        self.df['foo'] = 'bar'
+
+    def time_frame_dropna_axis1_any_mixed_dtypes(self):
+        self.df.dropna(how='any', axis=1)
+
+
+class frame_dtypes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 1000))
+
+    def time_frame_dtypes(self):
+        self.df.dtypes
+
+
+class frame_duplicated(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = (1 << 20)
+        self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64))
+        self.xs = np.random.randn((self.n // 64)).round(2)
+        self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), })
+
+    def time_frame_duplicated(self):
+        self.df.duplicated()
+
+
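+# NOTE: DataFrame.lookup(row_labels, col_labels) used below returns a
+# 1-D array of the values at each (row, col) pair; the two label arrays
+# must therefore have equal length.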
+class frame_fancy_lookup(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
+        self.df['foo'] = 'bar'
+        self.row_labels = list(self.df.index[::10])[:900]
+        self.col_labels = (list(self.df.columns) * 100)
+        self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object')
+        self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object')
+
+    def time_frame_fancy_lookup(self):
+        self.df.lookup(self.row_labels, self.col_labels)
+
+
+class frame_fancy_lookup_all(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
+        self.df['foo'] = 'bar'
+        self.row_labels = list(self.df.index[::10])[:900]
+        self.col_labels = (list(self.df.columns) * 100)
+        self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object')
+        self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object')
+
+    def time_frame_fancy_lookup_all(self):
+        self.df.lookup(self.row_labels_all, self.col_labels_all)
+
+
+class frame_fillna_inplace(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 100))
+        self.df.values[::2] = np.nan
+
+    def time_frame_fillna_inplace(self):
+        self.df.fillna(0, inplace=True)
+
+
+class frame_float_equal(object):
+    goal_time = 0.2
+
+    def setup(self):
+
+        def make_pair(frame):
+            df = frame
+            df2 = df.copy()
+            df2.ix[-1, -1] = np.nan
+            return (df, df2)
+        self.float_df = DataFrame(np.random.randn(1000, 1000))
+        self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+        self.nonunique_cols = self.object_df.copy()
+        self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+        self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+    def time_frame_float_equal(self):
+        # compare a frame with itself (the equal case); the helper that
+        # lived inside setup() was not visible here, so it is inlined
+        (df, df2) = self.pairs['float_df']
+        df.equals(df)
+
+
+class frame_float_unequal(object):
+    goal_time = 0.2
+
+    def setup(self):
+
+        def make_pair(frame):
+            df = frame
+            df2 = df.copy()
+            df2.ix[-1, -1] = np.nan
+            return (df, df2)
+        self.float_df = DataFrame(np.random.randn(1000, 1000))
+        self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
+        self.nonunique_cols = self.object_df.copy()
+        self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
+        self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))])
+
+    def time_frame_float_unequal(self):
+        # df2 differs from df in its bottom-right cell
+        (df, df2) = self.pairs['float_df']
+        df.equals(df2)
+
+
+class frame_from_records_generator(object):
+    goal_time = 0.2
+
+    def setup(self):
+        # bind the generator factory to self so the timed method can see it
+        def get_data(n=100000):
+            return ((x, (x * 20), (x * 100)) for x in xrange(n))
+        self.get_data = get_data
+
+    def time_frame_from_records_generator(self):
+        self.df = DataFrame.from_records(self.get_data())
+
+
+class frame_from_records_generator_nrows(object):
+    goal_time = 0.2
+
+    def setup(self):
+        def get_data(n=100000):
+            return ((x, (x * 20), (x * 100)) for x in xrange(n))
+        self.get_data = get_data
+
+    def time_frame_from_records_generator_nrows(self):
+        self.df = DataFrame.from_records(self.get_data(), nrows=1000)
+
+
+class frame_get_dtype_counts(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = pandas.DataFrame(np.random.randn(10, 10000))
+
+    def time_frame_get_dtype_counts(self):
+        self.df.get_dtype_counts()
+
+
+class frame_getitem_single_column(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 1000))
+        self.df2 = DataFrame(randn(3000, 1), columns=['A'])
+        self.df3 = DataFrame(randn(3000, 1))
+
+    def time_frame_getitem_single_column(self):
+        # repeated label-based access to a single column
+        for i in xrange(10000):
+            self.df2['A']
+
+
+class frame_getitem_single_column2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 1000))
+        self.df2 = DataFrame(randn(3000, 1), columns=['A'])
+        self.df3 = DataFrame(randn(3000, 1))
+
+    def time_frame_getitem_single_column2(self):
+        # same access pattern, but through a positional (integer) label
+        for i in xrange(10000):
+            self.df3[0]
+
+
+class frame_html_repr_trunc_mi(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.nrows = 10000
+        self.data = randn(self.nrows, 10)
+        self.idx = MultiIndex.from_arrays(np.tile(randn(3, (self.nrows / 100)), 100))
+        self.df = DataFrame(self.data, index=self.idx)
+
+    def time_frame_html_repr_trunc_mi(self):
+        self.df._repr_html_()
+
+
+class frame_html_repr_trunc_si(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.nrows = 10000
+        self.data = randn(self.nrows, 10)
+        self.idx = randn(self.nrows)
+        self.df = DataFrame(self.data, index=self.idx)
+
+    def time_frame_html_repr_trunc_si(self):
+        self.df._repr_html_()
+
+
+class frame_insert_100_columns_begin(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000
+
+    def time_frame_insert_100_columns_begin(self):
+        # rebuild the frame each run; inserting at position 0 shifts all
+        # existing columns
+        df = DataFrame(index=range(self.N))
+        new_col = np.random.randn(self.N)
+        for i in range(100):
+            df.insert(0, i, new_col)
+
+
+class frame_insert_500_columns_end(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000
+
+    def time_frame_insert_500_columns_end(self):
+        df = DataFrame(index=range(self.N))
+        new_col = np.random.randn(self.N)
+        for i in range(500):
+            df[i] = new_col
+
+
+class frame_interpolate(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 100))
+        self.df.values[::2] = np.nan
+
+    def time_frame_interpolate(self):
+        self.df.interpolate()
+
+
+class frame_interpolate_some_good(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), })
+        self.df.loc[1::5, 'A'] = np.nan
+        self.df.loc[1::5, 'C'] = np.nan
+
+    def time_frame_interpolate_some_good(self):
+        self.df.interpolate()
+
+
+class frame_interpolate_some_good_infer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), })
+        self.df.loc[1::5, 'A'] = np.nan
+        self.df.loc[1::5, 'C'] = np.nan
+
+    def time_frame_interpolate_some_good_infer(self):
+        self.df.interpolate(downcast='infer')
+
+
+class frame_isnull(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(1000, 1000)
+        self.df = DataFrame(self.data)
+
+    def time_frame_isnull(self):
+        isnull(self.df)
+
+
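+# NOTE: the two iteritems benchmarks differ only in cache state: the
+# first clears the internal _item_cache before iterating, the second
+# leaves it untouched between runs.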
time_frame_isnull(self): + isnull(self.df) + + +class frame_iteritems(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(randn(10000, 1000)) + self.df2 = DataFrame(randn(3000, 1), columns=['A']) + self.df3 = DataFrame(randn(3000, 1)) + + def f(): + if hasattr(self.df, '_item_cache'): + self.df._item_cache.clear() + for (name, col) in self.df.iteritems(): + pass + + def g(): + for (name, col) in self.df.iteritems(): + pass + + def h(): + for i in xrange(10000): + self.df2['A'] + + def j(): + for i in xrange(10000): + self.df3[0] + + def time_frame_iteritems(self): + f() + + +class frame_iteritems_cached(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(randn(10000, 1000)) + self.df2 = DataFrame(randn(3000, 1), columns=['A']) + self.df3 = DataFrame(randn(3000, 1)) + + def f(): + if hasattr(self.df, '_item_cache'): + self.df._item_cache.clear() + for (name, col) in self.df.iteritems(): + pass + + def g(): + for (name, col) in self.df.iteritems(): + pass + + def h(): + for i in xrange(10000): + self.df2['A'] + + def j(): + for i in xrange(10000): + self.df3[0] + + def time_frame_iteritems_cached(self): + g() + + +class frame_mask_bools(object): + goal_time = 0.2 + + def setup(self): + self.data = np.random.randn(1000, 500) + self.df = DataFrame(self.data) + self.df = self.df.where((self.df > 0)) + self.bools = (self.df > 0) + self.mask = isnull(self.df) + + def time_frame_mask_bools(self): + self.bools.mask(self.mask) + + +class frame_mask_floats(object): + goal_time = 0.2 + + def setup(self): + self.data = np.random.randn(1000, 500) + self.df = DataFrame(self.data) + self.df = self.df.where((self.df > 0)) + self.bools = (self.df > 0) + self.mask = isnull(self.df) + + def time_frame_mask_floats(self): + self.bools.astype(float).mask(self.mask) + + +class frame_nonunique_equal(object): + goal_time = 0.2 + + def setup(self): + + def make_pair(frame): + self.df = frame + self.df2 = self.df.copy() + self.df2.ix[((-1), (-1))] = np.nan + return (self.df, self.df2) + + def test_equal(name): + (self.df, self.df2) = pairs[name] + return self.df.equals(self.df) + + def test_unequal(name): + (self.df, self.df2) = pairs[name] + return self.df.equals(self.df2) + self.float_df = DataFrame(np.random.randn(1000, 1000)) + self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) + self.nonunique_cols = self.object_df.copy() + self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) + self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + + def time_frame_nonunique_equal(self): + test_equal('nonunique_cols') + + +class frame_nonunique_unequal(object): + goal_time = 0.2 + + def setup(self): + + def make_pair(frame): + self.df = frame + self.df2 = self.df.copy() + self.df2.ix[((-1), (-1))] = np.nan + return (self.df, self.df2) + + def test_equal(name): + (self.df, self.df2) = pairs[name] + return self.df.equals(self.df) + + def test_unequal(name): + (self.df, self.df2) = pairs[name] + return self.df.equals(self.df2) + self.float_df = DataFrame(np.random.randn(1000, 1000)) + self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) + self.nonunique_cols = self.object_df.copy() + self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) + self.pairs = dict([(name, make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + + def 
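+
+# Editor's note: several classes in this file originally defined helpers
+# inside setup() and then called them from the time_* method, where they are
+# out of scope. asv instantiates the class and runs setup() before timing
+# each time_* method, so shared state must travel on self. Minimal sketch of
+# the binding pattern (hypothetical names):
+#
+#     class example_benchmark(object):
+#         goal_time = 0.2
+#
+#         def setup(self):
+#             def helper():
+#                 return Series(np.random.randn(1000)).sum()
+#             self.helper = helper  # bind, or time_* cannot see it
+#
+#         def time_example_benchmark(self):
+#             self.helper()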
+
+
+class frame_object_equal(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame([(['foo'] * 1000)] * 1000)
+        self.df2 = self.df.copy()
+        self.df2.ix[-1, -1] = np.nan
+
+    def time_frame_object_equal(self):
+        self.df.equals(self.df)
+
+
+class frame_object_unequal(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame([(['foo'] * 1000)] * 1000)
+        self.df2 = self.df.copy()
+        self.df2.ix[-1, -1] = np.nan
+
+    def time_frame_object_unequal(self):
+        self.df.equals(self.df2)
+
+
+class frame_reindex_axis0(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 10000))
+        self.idx = np.arange(4000, 7000)
+
+    def time_frame_reindex_axis0(self):
+        self.df.reindex(self.idx)
+
+
+class frame_reindex_axis1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 10000))
+        self.idx = np.arange(4000, 7000)
+
+    def time_frame_reindex_axis1(self):
+        self.df.reindex(columns=self.idx)
+
+
+class frame_reindex_both_axes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 10000))
+        self.idx = np.arange(4000, 7000)
+
+    def time_frame_reindex_both_axes(self):
+        self.df.reindex(index=self.idx, columns=self.idx)
+
+
+class frame_reindex_both_axes_ix(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(10000, 10000))
+        self.idx = np.arange(4000, 7000)
+
+    def time_frame_reindex_both_axes_ix(self):
+        self.df.ix[(self.idx, self.idx)]
+
+
+class frame_reindex_upcast(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), 1: randint(0, 1000, 1000).astype(np.int16), 2: randint(0, 1000, 1000).astype(np.int32), 3: randint(0, 1000, 1000).astype(np.int64), }[randint(0, 4)]) for c in range(1000)]))
+
+    def time_frame_reindex_upcast(self):
+        self.df.reindex(permutation(range(1200)))
+
+
+class frame_repr_tall(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = pandas.DataFrame(np.random.randn(10000, 10))
+
+    def time_frame_repr_tall(self):
+        repr(self.df)
+
+
+class frame_repr_wide(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = pandas.DataFrame(np.random.randn(10, 10000))
+
+    def time_frame_repr_wide(self):
+        repr(self.df)
+
+
+class frame_shift_axis0(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.rand(10000, 500))
+
+    def time_frame_shift_axis0(self):
+        self.df.shift(1, axis=0)
+
+
+class frame_shift_axis_1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.rand(10000, 500))
+
+    def time_frame_shift_axis_1(self):
+        self.df.shift(1, axis=1)
+
+
+class frame_to_html_mixed(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.nrows = 500
+        self.df = DataFrame(randn(self.nrows, 10))
+        self.df[0] = period_range('2000', '2010', self.nrows)
+        self.df[1] = range(self.nrows)
+
+    def time_frame_to_html_mixed(self):
+        self.df.to_html()
+
+
+class frame_to_string_floats(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(100, 10))
+
+    def time_frame_to_string_floats(self):
+        self.df.to_string()
+
+
+class frame_xs_col(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(1, 100000))
+
+    def time_frame_xs_col(self):
+        self.df.xs(50000, axis=1)
+
+
+class frame_xs_row(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(randn(100000, 1))
+
+    def time_frame_xs_row(self):
+        self.df.xs(50000)
+
+
+class series_string_vector_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series((['abcdefg', np.nan] * 500000))
+
+    def time_series_string_vector_slice(self):
+        self.s.str[:5]
\ No newline at end of file
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
new file mode 100644
index 0000000000000..b0486617a52af
--- /dev/null
+++ b/asv_bench/benchmarks/gil.py
@@ -0,0 +1,267 @@
+from pandas_vb_common import *
+from pandas.core import common as com
+from pandas.util.testing import test_parallel
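+
+# Editor's note: a minimal sketch of how the nogil_* classes below use
+# pandas.util.testing.test_parallel (hypothetical example, not one of the
+# converted benchmarks). The decorator returns a wrapper that starts
+# num_threads threads, each running the wrapped zero-argument function, and
+# joins them, so the timed call measures multi-threaded throughput:
+#
+#     @test_parallel(num_threads=2)
+#     def run_sum():
+#         Series(np.random.randn(100000)).sum()
+#
+#     run_sum()  # executes the body on 2 threads and waits for both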
+
+
+class nogil_groupby_count_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        # the decorated helper must be bound to self to be visible from the
+        # timed method (it was previously stranded in setup()'s local scope)
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].count()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_count_2(self):
+        self.pg2()
+
+
+class nogil_groupby_last_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].last()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_last_2(self):
+        self.pg2()
+
+
+class nogil_groupby_max_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].max()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_max_2(self):
+        self.pg2()
+
+
+class nogil_groupby_mean_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].mean()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_mean_2(self):
+        self.pg2()
+
+
+class nogil_groupby_min_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].min()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_min_2(self):
+        self.pg2()
+
+
+class nogil_groupby_prod_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].prod()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_prod_2(self):
+        self.pg2()
+
+
+class nogil_groupby_sum_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].sum()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_sum_2(self):
+        self.pg2()
+
+
+class nogil_groupby_sum_4(object):
+    goal_time = 0.2
+
+    # the conversion also carried unused g2/g4/g8/pg2/pg8 variants here;
+    # only the path this class actually times is kept
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        def f():
+            self.df.groupby('key')['data'].sum()
+
+        @test_parallel(num_threads=4)
+        def pg4():
+            f()
+        self.pg4 = pg4
+
+    def time_nogil_groupby_sum_4(self):
+        self.pg4()
+
+
+class nogil_groupby_sum_8(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        def f():
+            self.df.groupby('key')['data'].sum()
+
+        @test_parallel(num_threads=8)
+        def pg8():
+            f()
+        self.pg8 = pg8
+
+    def time_nogil_groupby_sum_8(self):
+        self.pg8()
+
+
+class nogil_groupby_var_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.ngroups = 1000
+        np.random.seed(1234)
+        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+
+        @test_parallel(num_threads=2)
+        def pg2():
+            self.df.groupby('key')['data'].var()
+        self.pg2 = pg2
+
+    def time_nogil_groupby_var_2(self):
+        self.pg2()
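+
+# Editor's note (illustration with hypothetical data): com.take_1d is the
+# internal fancy-take exercised below; for a 1-d ndarray it produces the
+# same values as arr.take(indexer), and its threaded behaviour is what the
+# two take1d benchmarks stress:
+#
+#     arr = np.arange(1000, dtype='float64')
+#     indexer = np.arange(100, 900)
+#     out = com.take_1d(arr, indexer)  # same values as arr.take(indexer)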
+
+
+class nogil_take1d_float64(object):
+    goal_time = 0.2
+
+    # the leftover groupby frame the conversion copied into this setup was
+    # immediately overwritten; only the int64/float64 frame is kept
+    def setup(self):
+        self.N = 10000000
+        self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), })
+        self.indexer = np.arange(100, (len(self.df) - 100))
+
+        @test_parallel(num_threads=2)
+        def take_1d_pg2_int64():
+            com.take_1d(self.df.int64.values, self.indexer)
+        self.take_1d_pg2_int64 = take_1d_pg2_int64
+
+    # note: mirrors vb_suite, which pairs the float64-named benchmark with
+    # the int64 take (and vice versa)
+    def time_nogil_take1d_float64(self):
+        self.take_1d_pg2_int64()
+
+
+class nogil_take1d_int64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000000
+        self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), })
+        self.indexer = np.arange(100, (len(self.df) - 100))
+
+        @test_parallel(num_threads=2)
+        def take_1d_pg2_float64():
+            com.take_1d(self.df.float64.values, self.indexer)
+        self.take_1d_pg2_float64 = take_1d_pg2_float64
+
+    def time_nogil_take1d_int64(self):
+        self.take_1d_pg2_float64()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
new file mode 100644
index 0000000000000..4f1f4e46b4a31
--- /dev/null
+++ b/asv_bench/benchmarks/groupby.py
@@ -0,0 +1,1683 @@
+from pandas_vb_common import *
+from itertools import product
+from string import ascii_letters, digits
+
+
+class groupby_agg_builtins1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(27182)
+        self.n = 100000
+        self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+
+    def time_groupby_agg_builtins1(self):
+        self.df.groupby('jim').agg([sum, min, max])
+
+
+class groupby_agg_builtins2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(27182)
+        self.n = 100000
+        self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
+
+    def time_groupby_agg_builtins2(self):
+        self.df.groupby(['jim', 'joe']).agg([sum, min, max])
+
+
+class groupby_apply_dict_return(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = np.arange(1000).repeat(10)
+        self.data = Series(randn(len(self.labels)))
+        self.f = (lambda x: {'first': x.values[0], 'last': x.values[(-1)], })
+
+    def time_groupby_apply_dict_return(self):
+        self.data.groupby(self.labels).apply(self.f)
+
+
+class groupby_dt_size(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 100000
+        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+        self.dates = (np.datetime64('now') + self.offsets)
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+
+    def time_groupby_dt_size(self):
+        self.df.groupby(['dates']).size()
+
+
+class groupby_dt_timegrouper_size(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 100000
+        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+        self.dates = (np.datetime64('now') + self.offsets)
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+
+    def time_groupby_dt_timegrouper_size(self):
+        self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
+
+
+class groupby_first_datetimes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), })
+
+    def time_groupby_first_datetimes(self):
+        self.df.groupby('b').first()
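+
+# Editor's note: in the first/last benchmarks below, two of every three
+# values are deliberately set to NaN — GroupBy.first()/last() return the
+# first and last *non-null* entry per group, so the NaN density is part of
+# the workload. Tiny illustration (hypothetical data):
+#
+#     s = Series([np.nan, 1.0, np.nan, 2.0])
+#     s.groupby([0, 0, 1, 1]).first()  # group 0 -> 1.0, group 1 -> 2.0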
+
+
+class groupby_first_float32(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = np.arange(10000).repeat(10)
+        self.data = Series(randn(len(self.labels)))
+        self.data[::3] = np.nan
+        self.data[1::3] = np.nan
+        self.data2 = Series(randn(len(self.labels)), dtype='float32')
+        self.data2[::3] = np.nan
+        self.data2[1::3] = np.nan
+        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+    def time_groupby_first_float32(self):
+        self.data2.groupby(self.labels).first()
+
+
+class groupby_first_float64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = np.arange(10000).repeat(10)
+        self.data = Series(randn(len(self.labels)))
+        self.data[::3] = np.nan
+        self.data[1::3] = np.nan
+        self.data2 = Series(randn(len(self.labels)), dtype='float32')
+        self.data2[::3] = np.nan
+        self.data2[1::3] = np.nan
+        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+    def time_groupby_first_float64(self):
+        self.data.groupby(self.labels).first()
+
+
+class groupby_first_object(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+
+    def time_groupby_first_object(self):
+        self.df.groupby('b').first()
+
+
+class groupby_frame_apply(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.labels = np.random.randint(0, 2000, size=self.N)
+        self.labels2 = np.random.randint(0, 3, size=self.N)
+        self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
+        # bound as an attribute so the timed method can reach it
+        self.f = (lambda g: 1)
+
+    def time_groupby_frame_apply(self):
+        self.df.groupby(['key', 'key2']).apply(self.f)
+
+
+class groupby_frame_apply_overhead(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.labels = np.random.randint(0, 2000, size=self.N)
+        self.labels2 = np.random.randint(0, 3, size=self.N)
+        self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
+        self.f = (lambda g: 1)
+
+    def time_groupby_frame_apply_overhead(self):
+        self.df.groupby('key').apply(self.f)
+
+
+class groupby_frame_cython_many_columns(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = np.random.randint(0, 100, size=1000)
+        self.df = DataFrame(randn(1000, 1000))
+
+    def time_groupby_frame_cython_many_columns(self):
+        self.df.groupby(self.labels).sum()
+
+
+class groupby_frame_median(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(100000, 2)
+        self.labels = np.random.randint(0, 1000, size=100000)
+        self.df = DataFrame(self.data)
+
+    def time_groupby_frame_median(self):
+        self.df.groupby(self.labels).median()
+
+
+class groupby_frame_nth_any(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+
+    def time_groupby_frame_nth_any(self):
+        self.df.groupby(0).nth(0, dropna='any')
+
+
+class groupby_frame_nth_none(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randint(1, 100, (10000, 2)))
+
+    def time_groupby_frame_nth_none(self):
+        self.df.groupby(0).nth(0)
+
+
+class groupby_frame_singlekey_integer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.data = np.random.randn(100000, 1)
+        self.labels = np.random.randint(0, 1000, size=100000)
+        self.df = DataFrame(self.data)
+
+    def time_groupby_frame_singlekey_integer(self):
+        self.df.groupby(self.labels).sum()
+
+
+class groupby_indices(object):
+    goal_time = 0.2
+
+    def setup(self):
+        try:
+            self.rng = date_range('1/1/2000', '12/31/2005', freq='H')
+            # must land on self, or the timed method sees nothing
+            (self.year, self.month, self.day) = (self.rng.year, self.rng.month, self.rng.day)
+        except:
+            self.rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())
+            self.year = self.rng.map((lambda x: x.year))
+            self.month = self.rng.map((lambda x: x.month))
+            self.day = self.rng.map((lambda x: x.day))
+        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+    def time_groupby_indices(self):
+        len(self.ts.groupby([self.year, self.month, self.day]))
+
+
+class groupby_int64_overflow(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5))
+        self.i = np.random.choice(len(self.arr), (len(self.arr) * 5))
+        self.arr = np.vstack((self.arr, self.arr[self.i]))
+        self.i = np.random.permutation(len(self.arr))
+        self.arr = self.arr[self.i]
+        self.df = DataFrame(self.arr, columns=list('abcde'))
+        (self.df['jim'], self.df['joe']) = (np.random.randn(2, len(self.df)) * 10)
+
+    def time_groupby_int64_overflow(self):
+        self.df.groupby(list('abcde')).max()
+
+
+class groupby_int_count(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 10000
+        self.df = DataFrame({'key1': randint(0, 500, size=self.n), 'key2': randint(0, 100, size=self.n), 'ints': randint(0, 1000, size=self.n), 'ints2': randint(0, 1000, size=self.n), })
+
+    def time_groupby_int_count(self):
+        self.df.groupby(['key1', 'key2']).count()
+
+
+class groupby_last_datetimes(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), })
+
+    def time_groupby_last_datetimes(self):
+        self.df.groupby('b').last()
+
+
+class groupby_last_float32(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = np.arange(10000).repeat(10)
+        self.data = Series(randn(len(self.labels)))
+        self.data[::3] = np.nan
+        self.data[1::3] = np.nan
+        self.data2 = Series(randn(len(self.labels)), dtype='float32')
+        self.data2[::3] = np.nan
+        self.data2[1::3] = np.nan
+        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+    def time_groupby_last_float32(self):
+        self.data2.groupby(self.labels).last()
+
+
+class groupby_last_float64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.labels = np.arange(10000).repeat(10)
+        self.data = Series(randn(len(self.labels)))
+        self.data[::3] = np.nan
+        self.data[1::3] = np.nan
+        self.data2 = Series(randn(len(self.labels)), dtype='float32')
+        self.data2[::3] = np.nan
+        self.data2[1::3] = np.nan
+        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+
+    def time_groupby_last_float64(self):
+        self.data.groupby(self.labels).last()
+
+
+class groupby_last_object(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+
+    def time_groupby_last_object(self):
+        self.df.groupby('b').last()
+
+
+class groupby_multi_count(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 10000
+        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+        self.dates = (np.datetime64('now') + self.offsets)
+        self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat')
+        self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
+        self.value2 = np.random.randn(self.n)
+        self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
+        self.obj = tm.choice(list('ab'), size=self.n).astype(object)
+        self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'dates': self.dates, 'value2': self.value2, 'value3': np.random.randn(self.n), 'ints': np.random.randint(0, 1000, size=self.n), 'obj': self.obj, 'offsets': self.offsets, })
+
+    def time_groupby_multi_count(self):
+        self.df.groupby(['key1', 'key2']).count()
+
+
+class groupby_multi_cython(object):
+    goal_time = 0.2
+
+    # get_test_data is localized (the conversion had it writing scratch
+    # self.* state), and the unused f()/simple_series leftovers are dropped
+    def setup(self):
+        self.N = 100000
+        self.ngroups = 100
+
+        def get_test_data(ngroups=100, n=self.N):
+            unique_groups = range(ngroups)
+            arr = np.asarray(np.tile(unique_groups, (n / ngroups)), dtype=object)
+            if (len(arr) < n):
+                arr = np.asarray((list(arr) + unique_groups[:(n - len(arr))]), dtype=object)
+            random.shuffle(arr)
+            return arr
+        self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+    def time_groupby_multi_cython(self):
+        self.df.groupby(['key1', 'key2']).sum()
+
+
+class groupby_multi_different_functions(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        self.fac2 = np.array(['one', 'two'], dtype='O')
+        self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
+
+    def time_groupby_multi_different_functions(self):
+        self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum', })
+
+
+class groupby_multi_different_numpy_functions(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        self.fac2 = np.array(['one', 'two'], dtype='O')
+        self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
+
+    def time_groupby_multi_different_numpy_functions(self):
+        self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum, })
+
+
+class groupby_multi_index(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = (((5 * 7) * 11) * (1 << 9))
+        self.alpha = list(map(''.join, product((ascii_letters + digits), repeat=4)))
+        self.f = (lambda k: np.repeat(np.random.choice(self.alpha, (self.n // k)), k))
+        self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5), 'd': self.f(1), })
+        self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
+        self.i = np.random.permutation(len(self.df))
+        self.df = self.df.iloc[self.i].reset_index(drop=True).copy()
+
+    def time_groupby_multi_index(self):
+        self.df.groupby(list('abcd')).max()
+
+
+class groupby_multi_python(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.ngroups = 100
+
+        def get_test_data(ngroups=100, n=self.N):
+            unique_groups = range(ngroups)
+            arr = np.asarray(np.tile(unique_groups, (n / ngroups)), dtype=object)
+            if (len(arr) < n):
+                arr = np.asarray((list(arr) + unique_groups[:(n - len(arr))]), dtype=object)
+            random.shuffle(arr)
+            return arr
+        self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+    def time_groupby_multi_python(self):
+        self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum()))
+
+
+class groupby_multi_series_op(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.ngroups = 100
+
+        def get_test_data(ngroups=100, n=self.N):
+            unique_groups = range(ngroups)
+            arr = np.asarray(np.tile(unique_groups, (n / ngroups)), dtype=object)
+            if (len(arr) < n):
+                arr = np.asarray((list(arr) + unique_groups[:(n - len(arr))]), dtype=object)
+            random.shuffle(arr)
+            return arr
+        self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), })
+
+    def time_groupby_multi_series_op(self):
+        self.df.groupby(['key1', 'key2'])['data1'].agg(np.std)
+
+
+class groupby_multi_size(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 100000
+        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
+        self.dates = (np.datetime64('now') + self.offsets)
+        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+
+    def time_groupby_multi_size(self):
+        self.df.groupby(['key1', 'key2']).size()
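+
+# Editor's note: the groupby_ngroups_* matrix that follows was emitted by
+# the conversion as 62 near-identical classes (ngroups in {100, 10000} x one
+# GroupBy method each) over the same seeded frame; it is rebuilt below
+# programmatically. As an aside (not a change to the suite), asv's own
+# parameterization hook could express the same matrix in one class:
+#
+#     class groupby_method(object):  # hypothetical
+#         params = [100, 10000]
+#         param_names = ['ngroups']
+#
+#         def setup(self, ngroups):
+#             np.random.seed(1234)
+#             ...
+#
+#         def time_sum(self, ngroups):
+#             self.df.groupby('value')['timestamp'].sum()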
+
+
+def _make_groupby_ngroups_benchmark(ngroups, method):
+    # Builds one benchmark class per (ngroups, method) pair, equivalent to
+    # the explicit groupby_ngroups_{ngroups}_{method} classes produced by
+    # the conversion: a seeded frame of ngroups*2 rows grouped on 'value',
+    # with the given SeriesGroupBy method applied to 'timestamp'.
+    def setup(self):
+        np.random.seed(1234)
+        size = (ngroups * 2)
+        rng = np.arange(ngroups)
+        self.df = DataFrame(dict(timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
+                                 value=np.random.randint(0, size, size=size)))
+
+    def time_method(self):
+        getattr(self.df.groupby('value')['timestamp'], method)()
+    name = 'groupby_ngroups_{0}_{1}'.format(ngroups, method)
+    time_method.__name__ = ('time_' + name)
+    return type(name, (object,), {'goal_time': 0.2, 'setup': setup,
+                                  time_method.__name__: time_method})
+
+
+for _ngroups in (100, 10000):
+    for _method in ('all', 'any', 'count', 'cumcount', 'cummax', 'cummin',
+                    'cumprod', 'cumsum', 'describe', 'diff', 'first', 'head',
+                    'last', 'mad', 'max', 'mean', 'median', 'min', 'nunique',
+                    'pct_change', 'prod', 'rank', 'sem', 'size', 'skew',
+                    'std', 'sum', 'tail', 'unique', 'value_counts', 'var'):
+        _bench = _make_groupby_ngroups_benchmark(_ngroups, _method)
+        # stuff them into globals() so asv detects them (the same trick
+        # vb_suite/frame_ctor.py uses for its dynamic benchmarks)
+        globals()[_bench.__name__] = _bench
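+
+# Editor's note: in the nth benchmarks below, dropna controls whether NaN
+# rows are dropped before the nth row is picked: nth(0, dropna='any'/'all')
+# behaves like first() (first valid entry per group), while plain nth(0)
+# takes the raw first row, NaN or not. Tiny illustration (hypothetical
+# data):
+#
+#     s = Series([np.nan, 1.0])
+#     s.groupby([0, 0]).nth(0)                 # NaN
+#     s.groupby([0, 0]).nth(0, dropna='all')   # 1.0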
Series(randn(len(self.labels)), dtype='float32') + self.data2[::3] = np.nan + self.data2[1::3] = np.nan + self.labels = self.labels.take(np.random.permutation(len(self.labels))) + + def time_groupby_nth_float64_any(self): + self.data.groupby(self.labels).nth(0, dropna='all') + + +class groupby_nth_float64_none(object): + goal_time = 0.2 + + def setup(self): + self.labels = np.arange(10000).repeat(10) + self.data = Series(randn(len(self.labels))) + self.data[::3] = np.nan + self.data[1::3] = np.nan + self.data2 = Series(randn(len(self.labels)), dtype='float32') + self.data2[::3] = np.nan + self.data2[1::3] = np.nan + self.labels = self.labels.take(np.random.permutation(len(self.labels))) + + def time_groupby_nth_float64_none(self): + self.data.groupby(self.labels).nth(0) + + +class groupby_nth_object_any(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), }) + + def time_groupby_nth_object_any(self): + self.df.groupby('b').nth(0, dropna='any') + + +class groupby_nth_object_none(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), }) + + def time_groupby_nth_object_none(self): + self.df.groupby('b').nth(0) + + +class groupby_pivot_table(object): + goal_time = 0.2 + + def setup(self): + self.fac1 = np.array(['A', 'B', 'C'], dtype='O') + self.fac2 = np.array(['one', 'two'], dtype='O') + self.ind1 = np.random.randint(0, 3, size=100000) + self.ind2 = np.random.randint(0, 2, size=100000) + self.df = DataFrame({'key1': self.fac1.take(self.ind1), 'key2': self.fac2.take(self.ind2), 'key3': self.fac2.take(self.ind2), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) + + def time_groupby_pivot_table(self): + self.df.pivot_table(index='key1', columns=['key2', 'key3']) + + +class groupby_series_nth_any(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) + + def time_groupby_series_nth_any(self): + self.df[1].groupby(self.df[0]).nth(0, dropna='any') + + +class groupby_series_nth_none(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) + + def time_groupby_series_nth_none(self): + self.df[1].groupby(self.df[0]).nth(0) + + +class groupby_series_simple_cython(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.ngroups = 100 + + def get_test_data(ngroups=100, n=self.N): + self.unique_groups = range(self.ngroups) + self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object) + if (len(self.arr) < n): + self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) + random.shuffle(self.arr) + return self.arr + self.df = DataFrame({'key1': get_test_data(ngroups=self.ngroups), 'key2': get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) + + def f(): + self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) + self.simple_series = Series(np.random.randn(self.N)) + self.key1 = self.df['key1'] + + def time_groupby_series_simple_cython(self): + self.df.groupby('key1').rank(pct=True) + + +class groupby_simple_compress_timing(object): + goal_time = 0.2 + + def setup(self): + self.data = np.random.randn(1000000, 2) + self.labels = np.random.randint(0, 1000, size=1000000) + self.df = DataFrame(self.data) + + def time_groupby_simple_compress_timing(self): + 
self.df.groupby(self.labels).mean() + + +class groupby_sum_booleans(object): + goal_time = 0.2 + + def setup(self): + self.N = 500 + self.df = DataFrame({'ii': range(self.N), 'bb': [True for x in range(self.N)], }) + + def time_groupby_sum_booleans(self): + self.df.groupby('ii').sum() + + +class groupby_sum_multiindex(object): + goal_time = 0.2 + + def setup(self): + self.N = 50 + self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B']) + + def time_groupby_sum_multiindex(self): + self.df.groupby(level=[0, 1]).sum() + + +class groupby_transform(object): + goal_time = 0.2 + + def setup(self): + self.n_dates = 400 + self.n_securities = 250 + self.n_columns = 3 + self.share_na = 0.1 + self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B') + self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates)) + self.secid_min = int('10000000', 16) + self.secid_max = int('F0000000', 16) + self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) + self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step)) + self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], labels=[[i for i in xrange(self.n_dates) for _ in xrange(self.n_securities)], (range(self.n_securities) * self.n_dates)], names=['date', 'security_id']) + self.n_data = len(self.data_index) + self.columns = Index(['factor{}'.format(i) for i in xrange(1, (self.n_columns + 1))]) + self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) + self.step = int((self.n_data * self.share_na)) + for column_index in xrange(self.n_columns): + self.index = column_index + while (self.index < self.n_data): + self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan) + self.index += self.step + self.f_fillna = (lambda x: x.fillna(method='pad')) + + def time_groupby_transform(self): + self.data.groupby(level='security_id').transform(self.f_fillna) + + +class groupby_transform_multi_key1(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(2718281) + self.n = 20000 + self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) + + def time_groupby_transform_multi_key1(self): + self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + + +class groupby_transform_multi_key2(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(2718281) + self.n = 20000 + self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) + self.df['jim'] = self.df['joe'] + + def time_groupby_transform_multi_key2(self): + self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + + +class groupby_transform_multi_key3(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(2718281) + self.n = 200000 + self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) + + def time_groupby_transform_multi_key3(self): + self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + + +class groupby_transform_multi_key4(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(2718281) + self.n = 200000 + self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) + self.df['jim'] = self.df['joe'] + + def time_groupby_transform_multi_key4(self): + self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + + +class 
groupby_transform_series(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(0) + self.N = 120000 + self.N_TRANSITIONS = 1400 + self.transition_points = np.random.permutation(np.arange(self.N))[:self.N_TRANSITIONS] + self.transition_points.sort() + self.transitions = np.zeros((self.N,), dtype=np.bool) + self.transitions[self.transition_points] = True + self.g = self.transitions.cumsum() + self.df = DataFrame({'signal': np.random.rand(self.N), }) + + def time_groupby_transform_series(self): + self.df['signal'].groupby(self.g).transform(np.mean) + + +class groupby_transform_series2(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(0) + self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), }) + + def time_groupby_transform_series2(self): + self.df.groupby('id')['val'].transform(np.mean) + + +class groupby_transform_ufunc(object): + goal_time = 0.2 + + def setup(self): + self.n_dates = 400 + self.n_securities = 250 + self.n_columns = 3 + self.share_na = 0.1 + self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B') + self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates)) + self.secid_min = int('10000000', 16) + self.secid_max = int('F0000000', 16) + self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) + self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step)) + self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], labels=[[i for i in xrange(self.n_dates) for _ in xrange(self.n_securities)], (range(self.n_securities) * self.n_dates)], names=['date', 'security_id']) + self.n_data = len(self.data_index) + self.columns = Index(['factor{}'.format(i) for i in xrange(1, (self.n_columns + 1))]) + self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) + self.step = int((self.n_data * self.share_na)) + for column_index in xrange(self.n_columns): + self.index = column_index + while (self.index < self.n_data): + self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan) + self.index += self.step + self.f_fillna = (lambda x: x.fillna(method='pad')) + + def time_groupby_transform_ufunc(self): + self.data.groupby(level='date').transform(np.max) + + +class series_value_counts_int64(object): + goal_time = 0.2 + + def setup(self): + self.s = Series(np.random.randint(0, 1000, size=100000)) + + def time_series_value_counts_int64(self): + self.s.value_counts() + + +class series_value_counts_strings(object): + goal_time = 0.2 + + def setup(self): + self.K = 1000 + self.N = 100000 + self.uniques = tm.makeStringIndex(self.K).values + self.s = Series(np.tile(self.uniques, (self.N // self.K))) + + def time_series_value_counts_strings(self): + self.s.value_counts() \ No newline at end of file diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py new file mode 100644 index 0000000000000..9e36f735f8608 --- /dev/null +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -0,0 +1,351 @@ +from pandas_vb_common import * +import os + + +class query_store_table(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.h5' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.index = date_range('1/1/2000', periods=25000) + self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) + remove(self.f) + self.store = HDFStore(self.f) + 
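Note: the generated classes above all follow the asv (airspeed velocity) discovery convention: a class-level goal_time, a setup() that builds fixtures, and time_* methods whose bodies are the only code measured. A minimal, self-contained sketch of that pattern (the class and column names here are illustrative, not part of this patch):

    import numpy as np
    from pandas import DataFrame

    class groupby_example(object):
        # target duration asv aims for when repeating the timed call
        goal_time = 0.2

        def setup(self):
            # fixed seed so repeated runs time identical data
            np.random.seed(1234)
            self.df = DataFrame({'key': np.random.randint(0, 100, size=10000),
                                 'val': np.random.randn(10000)})

        def time_groupby_sum(self):
            # only this statement is timed; setup() cost is excluded
            self.df.groupby('key')['val'].sum()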
diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py
new file mode 100644
index 0000000000000..9e36f735f8608
--- /dev/null
+++ b/asv_bench/benchmarks/hdfstore_bench.py
@@ -0,0 +1,351 @@
+from pandas_vb_common import *
+import os
+
+
+class query_store_table(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = date_range('1/1/2000', periods=25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.append('df12', self.df)
+
+    def time_query_store_table(self):
+        self.store.select('df12', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])])
+
+    def teardown(self):
+        self.store.close()
+
+
+class query_store_table_wide(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = date_range('1/1/2000', periods=25000)
+        self.df = DataFrame(np.random.randn(25000, 100), index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.append('df11', self.df)
+
+    def time_query_store_table_wide(self):
+        self.store.select('df11', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])])
+
+    def teardown(self):
+        self.store.close()
+
+
+class read_store(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.put('df1', self.df)
+
+    def time_read_store(self):
+        self.store.get('df1')
+
+    def teardown(self):
+        self.store.close()
+
+
+class read_store_mixed(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.put('df3', self.df)
+
+    def time_read_store_mixed(self):
+        self.store.get('df3')
+
+    def teardown(self):
+        self.store.close()
+
+
+class read_store_table(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.append('df7', self.df)
+
+    def time_read_store_table(self):
+        self.store.select('df7')
+
+    def teardown(self):
+        self.store.close()
+
+
+class read_store_table_mixed(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.N = 10000
+        self.index = tm.makeStringIndex(self.N)
+        self.df = DataFrame({'float1': randn(self.N), 'float2': randn(self.N), 'string1': (['foo'] * self.N), 'bool1': ([True] * self.N), 'int1': np.random.randint(0, self.N, size=self.N), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.append('df5', self.df)
+
+    def time_read_store_table_mixed(self):
+        self.store.select('df5')
+
+    def teardown(self):
+        self.store.close()
+
+
+class read_store_table_panel(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in xrange(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in xrange(25)])
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.append('p1', self.p)
+
+    def time_read_store_table_panel(self):
+        self.store.select('p1')
+
+    def teardown(self):
+        self.store.close()
+
+
+class read_store_table_wide(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.df = DataFrame(np.random.randn(25000, 100))
+        remove(self.f)
+        self.store = HDFStore(self.f)
+        self.store.append('df9', self.df)
+
+    def time_read_store_table_wide(self):
+        self.store.select('df9')
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store(self):
+        self.store.put('df2', self.df)
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store_mixed(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store_mixed(self):
+        self.store.put('df4', self.df)
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store_table(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store_table(self):
+        self.store.append('df8', self.df)
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store_table_dc(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.df = DataFrame(np.random.randn(10000, 10), columns=[('C%03d' % i) for i in xrange(10)])
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store_table_dc(self):
+        self.store.append('df15', self.df, data_columns=True)
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store_table_mixed(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.index = tm.makeStringIndex(25000)
+        self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 25000, size=25000), }, index=self.index)
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store_table_mixed(self):
+        self.store.append('df6', self.df)
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store_table_panel(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in xrange(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in xrange(25)])
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store_table_panel(self):
+        self.store.append('p2', self.p)
+
+    def teardown(self):
+        self.store.close()
+
+
+class write_store_table_wide(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.h5'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.df = DataFrame(np.random.randn(25000, 100))
+        remove(self.f)
+        self.store = HDFStore(self.f)
+
+    def time_write_store_table_wide(self):
+        self.store.append('df10', self.df)
+
+    def teardown(self):
+        self.store.close()
\ No newline at end of file
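Note: the query_store_table benchmarks above exercise HDFStore.select() with where-terms on the index, which lets PyTables filter rows on disk instead of materialising the whole table. A small sketch of the same call pattern, assuming a throwaway file name:

    import numpy as np
    from pandas import DataFrame, HDFStore, date_range

    idx = date_range('1/1/2000', periods=25000)
    df = DataFrame({'float1': np.random.randn(25000)}, index=idx)
    store = HDFStore('__example__.h5')
    store.append('df', df)  # table format, so it is queryable
    # only rows whose index falls inside the window are read back
    sub = store.select('df', [('index', '>', df.index[10000]),
                              ('index', '<', df.index[15000])])
    store.close()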
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
new file mode 100644
index 0000000000000..9c181c92195ea
--- /dev/null
+++ b/asv_bench/benchmarks/index_object.py
@@ -0,0 +1,292 @@
+from pandas_vb_common import *
+
+
+class datetime_index_intersection(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=10000, freq='T')
+        self.rng2 = self.rng[:(-1)]
+
+    def time_datetime_index_intersection(self):
+        self.rng.intersection(self.rng2)
+
+
+class datetime_index_repr(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.dr = pd.date_range('20000101', freq='D', periods=100000)
+
+    def time_datetime_index_repr(self):
+        self.dr._is_dates_only
+
+
+class datetime_index_union(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=10000, freq='T')
+        self.rng2 = self.rng[:(-1)]
+
+    def time_datetime_index_union(self):
+        self.rng.union(self.rng2)
+
+
+class index_datetime_intersection(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute())
+        if (self.rng.dtype == object):
+            self.rng = self.rng.view(Index)
+        else:
+            self.rng = self.rng.asobject
+        self.rng2 = self.rng[:(-1)]
+
+    def time_index_datetime_intersection(self):
+        self.rng.intersection(self.rng2)
+
+
+class index_datetime_union(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute())
+        if (self.rng.dtype == object):
+            self.rng = self.rng.view(Index)
+        else:
+            self.rng = self.rng.asobject
+        self.rng2 = self.rng[:(-1)]
+
+    def time_index_datetime_union(self):
+        self.rng.union(self.rng2)
+
+
+class index_float64_boolean_indexer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_boolean_indexer(self):
+        self.idx[self.mask]
+
+
+class index_float64_boolean_series_indexer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_boolean_series_indexer(self):
+        self.idx[self.series_mask]
+
+
+class index_float64_construct(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.baseidx = np.arange(1000000.0)
+
+    def time_index_float64_construct(self):
+        Index(self.baseidx)
+
+
+class index_float64_div(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_div(self):
+        (self.idx / 2)
+
+
+class index_float64_get(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_get(self):
+        self.idx[1]
+
+
+class index_float64_mul(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_mul(self):
+        (self.idx * 2)
+
+
+class index_float64_slice_indexer_basic(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_slice_indexer_basic(self):
+        self.idx[:(-1)]
+
+
+class index_float64_slice_indexer_even(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeFloatIndex(1000000)
+        self.mask = ((np.arange(self.idx.size) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_float64_slice_indexer_even(self):
+        self.idx[::2]
+
+
+class index_int64_intersection(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.options = np.arange(self.N)
+        self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+        self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+
+    def time_index_int64_intersection(self):
+        self.left.intersection(self.right)
+
+
+class index_int64_union(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        self.options = np.arange(self.N)
+        self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+        self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)]))
+
+    def time_index_int64_union(self):
+        self.left.union(self.right)
+
+
+class index_str_boolean_indexer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeStringIndex(1000000)
+        self.mask = ((np.arange(1000000) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_str_boolean_indexer(self):
+        self.idx[self.mask]
+
+
+class index_str_boolean_series_indexer(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeStringIndex(1000000)
+        self.mask = ((np.arange(1000000) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_str_boolean_series_indexer(self):
+        self.idx[self.series_mask]
+
+
+class index_str_slice_indexer_basic(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeStringIndex(1000000)
+        self.mask = ((np.arange(1000000) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_str_slice_indexer_basic(self):
+        self.idx[:(-1)]
+
+
+class index_str_slice_indexer_even(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx = tm.makeStringIndex(1000000)
+        self.mask = ((np.arange(1000000) % 3) == 0)
+        self.series_mask = Series(self.mask)
+
+    def time_index_str_slice_indexer_even(self):
+        self.idx[::2]
+
+
+class multiindex_duplicated(object):
+    goal_time = 0.2
+
+    def setup(self):
+        (n, k) = (200, 5000)
+        self.levels = [np.arange(n), tm.makeStringIndex(n).values, (1000 + np.arange(n))]
+        self.labels = [np.random.choice(n, (k * n)) for lev in self.levels]
+        self.mi = MultiIndex(levels=self.levels, labels=self.labels)
+
+    def time_multiindex_duplicated(self):
+        self.mi.duplicated()
+
+
+class multiindex_from_product(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.iterables = [tm.makeStringIndex(10000), xrange(20)]
+
+    def time_multiindex_from_product(self):
+        MultiIndex.from_product(self.iterables)
+
+
+class multiindex_sortlevel_int64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = ((((3 * 5) * 7) * 11) * (1 << 10))
+        (low, high) = (((-1) << 12), (1 << 12))
+        self.f = (lambda k: np.repeat(np.random.randint(low, high, (self.n // k)), k))
+        self.i = np.random.permutation(self.n)
+        self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i]
+
+    def time_multiindex_sortlevel_int64(self):
+        self.mi.sortlevel()
+
+
+class multiindex_with_datetime_level_full(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.level1 = range(1000)
+        self.level2 = date_range(start='1/1/2012', periods=100)
+        self.mi = MultiIndex.from_product([self.level1, self.level2])
+
+    def time_multiindex_with_datetime_level_full(self):
+        self.mi.copy().values
+
+
+class multiindex_with_datetime_level_sliced(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.level1 = range(1000)
+        self.level2 = date_range(start='1/1/2012', periods=100)
+        self.mi = MultiIndex.from_product([self.level1, self.level2])
+
+    def time_multiindex_with_datetime_level_sliced(self):
+        self.mi[:10].values
\ No newline at end of file
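Note: most of these index benchmarks time set operations and indexers on large Index objects. Union and intersection of two overlapping DatetimeIndexes, as in datetime_index_union above, is the case where pandas can often avoid a full hash-based set operation because both inputs are monotonic. The shape of the timed calls, for reference:

    from pandas import date_range

    rng = date_range('1/1/2000', periods=10000, freq='T')
    rng2 = rng[:-1]
    rng.intersection(rng2)  # overlapping, monotonic inputs
    rng.union(rng2)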
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
new file mode 100644
index 0000000000000..e76a87ab881c9
--- /dev/null
+++ b/asv_bench/benchmarks/indexing.py
@@ -0,0 +1,458 @@
+from pandas_vb_common import *
+import pandas.computation.expressions as expr
+
+
+class dataframe_getitem_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(1000)
+        self.columns = tm.makeStringIndex(30)
+        self.df = DataFrame(np.random.rand(1000, 30), index=self.index, columns=self.columns)
+        self.idx = self.index[100]
+        self.col = self.columns[10]
+
+    def time_dataframe_getitem_scalar(self):
+        self.df[self.col][self.idx]
+
+
+class datamatrix_getitem_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        try:
+            self.klass = DataMatrix
+        except:
+            self.klass = DataFrame
+        self.index = tm.makeStringIndex(1000)
+        self.columns = tm.makeStringIndex(30)
+        self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns)
+        self.idx = self.index[100]
+        self.col = self.columns[10]
+
+    def time_datamatrix_getitem_scalar(self):
+        self.df[self.col][self.idx]
+
+
+class series_get_value(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(1000)
+        self.s = Series(np.random.rand(1000), index=self.index)
+        self.idx = self.index[100]
+
+    def time_series_get_value(self):
+        self.s.get_value(self.idx)
+
+
+class time_series_getitem_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        tm.N = 1000
+        self.ts = tm.makeTimeSeries()
+        self.dt = self.ts.index[500]
+
+    def time_time_series_getitem_scalar(self):
+        self.ts[self.dt]
+
+
+class frame_iloc_big(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(dict(A=(['foo'] * 1000000)))
+
+    def time_frame_iloc_big(self):
+        self.df.iloc[:100, 0]
+
+
+class frame_iloc_dups(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), })
+        self.idx = (np.array(range(30)) * 99)
+        self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), })
+        self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)])
+
+    def time_frame_iloc_dups(self):
+        self.df2.iloc[self.idx]
+
+
+class frame_loc_dups(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), })
+        self.idx = (np.array(range(30)) * 99)
+        self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), })
+        self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)])
+
+    def time_frame_loc_dups(self):
+        self.df2.loc[self.idx]
+
+
+class frame_xs_mi_ix(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)])
+        self.s = Series(np.random.randn(1000000), index=self.mi)
+        self.df = DataFrame(self.s)
+
+    def time_frame_xs_mi_ix(self):
+        self.df.ix[999]
+
+
+class indexing_dataframe_boolean(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(50000, 100))
+        self.df2 = DataFrame(np.random.randn(50000, 100))
+
+    def time_indexing_dataframe_boolean(self):
+        (self.df > self.df2)
+
+
+class indexing_dataframe_boolean_no_ne(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(50000, 100))
+        self.df2 = DataFrame(np.random.randn(50000, 100))
+        expr.set_use_numexpr(False)
+
+    def time_indexing_dataframe_boolean_no_ne(self):
+        (self.df > self.df2)
+
+    def teardown(self):
+        expr.set_use_numexpr(True)
+
+
+class indexing_dataframe_boolean_rows(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
+        self.indexer = (self.df['B'] > 0)
+        self.obj_indexer = self.indexer.astype('O')
+
+    def time_indexing_dataframe_boolean_rows(self):
+        self.df[self.indexer]
+
+
+class indexing_dataframe_boolean_rows_object(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
+        self.indexer = (self.df['B'] > 0)
+        self.obj_indexer = self.indexer.astype('O')
+
+    def time_indexing_dataframe_boolean_rows_object(self):
+        self.df[self.obj_indexer]
+
+
+class indexing_dataframe_boolean_st(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(50000, 100))
+        self.df2 = DataFrame(np.random.randn(50000, 100))
+        expr.set_numexpr_threads(1)
+
+    def time_indexing_dataframe_boolean_st(self):
+        (self.df > self.df2)
+
+    def teardown(self):
+        expr.set_numexpr_threads()
+
+
+class indexing_frame_get_value(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(1000)
+        self.columns = tm.makeStringIndex(30)
+        self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns)
+        self.idx = self.index[100]
+        self.col = self.columns[10]
+
+    def time_indexing_frame_get_value(self):
+        self.df.get_value(self.idx, self.col)
+
+
+class indexing_frame_get_value_ix(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(1000)
+        self.columns = tm.makeStringIndex(30)
+        self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns)
+        self.idx = self.index[100]
+        self.col = self.columns[10]
+
+    def time_indexing_frame_get_value_ix(self):
+        self.df.ix[(self.idx, self.col)]
+
+
+class indexing_panel_subset(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.p = Panel(np.random.randn(100, 100, 100))
+        self.inds = range(0, 100, 10)
+
+    def time_indexing_panel_subset(self):
+        self.p.ix[(self.inds, self.inds, self.inds)]
+
+
+class multiindex_slicers(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.idx = pd.IndexSlice
+        self.n = 100000
+        self.mdt = pandas.DataFrame()
+        self.mdt['A'] = np.random.choice(range(10000, 45000, 1000), self.n)
+        self.mdt['B'] = np.random.choice(range(10, 400), self.n)
+        self.mdt['C'] = np.random.choice(range(1, 150), self.n)
+        self.mdt['D'] = np.random.choice(range(10000, 45000), self.n)
+        self.mdt['x'] = np.random.choice(range(400), self.n)
+        self.mdt['y'] = np.random.choice(range(25), self.n)
+        self.test_A = 25000
+        self.test_B = 25
+        self.test_C = 40
+        self.test_D = 35000
+        self.eps_A = 5000
+        self.eps_B = 5
+        self.eps_C = 5
+        self.eps_D = 5000
+        self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
+
+    def time_multiindex_slicers(self):
+        self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
+
+
+class series_getitem_array(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_getitem_array(self):
+        self.s[np.arange(10000)]
+
+
+class series_getitem_label_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(1000000)
+        self.s = Series(np.random.rand(1000000), index=self.index)
+        self.lbl = self.s.index[800000]
+
+    def time_series_getitem_label_slice(self):
+        self.s[:self.lbl]
+
+
+class series_getitem_list_like(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_getitem_list_like(self):
+        self.s[[800000]]
+
+
+class series_getitem_pos_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(1000000)
+        self.s = Series(np.random.rand(1000000), index=self.index)
+
+    def time_series_getitem_pos_slice(self):
+        self.s[:800000]
+
+
+class series_getitem_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_getitem_scalar(self):
+        self.s[800000]
+
+
+class series_getitem_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_getitem_slice(self):
+        self.s[:800000]
+
+
+class series_iloc_array(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_iloc_array(self):
+        self.s.iloc[np.arange(10000)]
+
+
+class series_iloc_list_like(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_iloc_list_like(self):
+        self.s.iloc[[800000]]
+
+
+class series_iloc_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_iloc_scalar(self):
+        self.s.iloc[800000]
+
+
+class series_iloc_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_iloc_slice(self):
+        self.s.iloc[:800000]
+
+
+class series_ix_array(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_ix_array(self):
+        self.s.ix[np.arange(10000)]
+
+
+class series_ix_list_like(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_ix_list_like(self):
+        self.s.ix[[800000]]
+
+
+class series_ix_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_ix_scalar(self):
+        self.s.ix[800000]
+
+
+class series_ix_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_ix_slice(self):
+        self.s.ix[:800000]
+
+
+class series_loc_array(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_loc_array(self):
+        self.s.loc[np.arange(10000)]
+
+
+class series_loc_list_like(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_loc_list_like(self):
+        self.s.loc[[800000]]
+
+
+class series_loc_scalar(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_loc_scalar(self):
+        self.s.loc[800000]
+
+
+class series_loc_slice(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.rand(1000000))
+
+    def time_series_loc_slice(self):
+        self.s.loc[:800000]
+
+
+class series_xs_mi_ix(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)])
+        self.s = Series(np.random.randn(1000000), index=self.mi)
+
+    def time_series_xs_mi_ix(self):
+        self.s.ix[999]
+
+
+class sort_level_one(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.a = np.repeat(np.arange(100), 1000)
+        self.b = np.tile(np.arange(1000), 100)
+        self.midx = MultiIndex.from_arrays([self.a, self.b])
+        self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
+
+    def time_sort_level_one(self):
+        self.midx.sortlevel(1)
+
+
+class sort_level_zero(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.a = np.repeat(np.arange(100), 1000)
+        self.b = np.tile(np.arange(1000), 100)
+        self.midx = MultiIndex.from_arrays([self.a, self.b])
+        self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
+
+    def time_sort_level_zero(self):
+        self.midx.sortlevel(0)
\ No newline at end of file
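Note: the multiindex_slicers benchmark above relies on pd.IndexSlice to spell out a label range per level of a sorted MultiIndex. A compact sketch of the idiom with made-up bounds (the index must be lexsorted, hence the sortlevel() call):

    import numpy as np
    import pandas as pd

    idx = pd.IndexSlice
    df = pd.DataFrame({'A': np.random.randint(0, 100, 10000),
                       'B': np.random.randint(0, 100, 10000),
                       'x': np.random.randn(10000)})
    mdt = df.set_index(['A', 'B']).sortlevel()
    sub = mdt.loc[idx[10:20, 30:40], :]  # label range on each level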
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
new file mode 100644
index 0000000000000..2addc810a218f
--- /dev/null
+++ b/asv_bench/benchmarks/inference.py
@@ -0,0 +1,138 @@
+from pandas_vb_common import *
+import pandas as pd
+
+
+class dtype_infer_datetime64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_datetime64(self):
+        (self.df_datetime64['A'] - self.df_datetime64['B'])
+
+
+class dtype_infer_float32(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_float32(self):
+        (self.df_float32['A'] + self.df_float32['B'])
+
+
+class dtype_infer_float64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_float64(self):
+        (self.df_float64['A'] + self.df_float64['B'])
+
+
+class dtype_infer_int32(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_int32(self):
+        (self.df_int32['A'] + self.df_int32['B'])
+
+
+class dtype_infer_int64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_int64(self):
+        (self.df_int64['A'] + self.df_int64['B'])
+
+
+class dtype_infer_timedelta64_1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_timedelta64_1(self):
+        (self.df_timedelta64['A'] + self.df_timedelta64['B'])
+
+
+class dtype_infer_timedelta64_2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_timedelta64_2(self):
+        (self.df_timedelta64['A'] + self.df_timedelta64['A'])
+
+
+class dtype_infer_uint32(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 500000
+        self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64')))
+        self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32')))
+        self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32')))
+        self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64')))
+        self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32')))
+        self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
+        self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
+
+    def time_dtype_infer_uint32(self):
+        (self.df_uint32['A'] + self.df_uint32['B'])
\ No newline at end of file
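Note: each dtype_infer_* class above times a single binary operation, since the cost of interest is the dtype inference and result allocation rather than the arithmetic itself. For example, subtracting two datetime64 columns must infer a timedelta64 result:

    import numpy as np
    from pandas import DataFrame, to_datetime

    N = 1000
    df = DataFrame(dict(A=to_datetime(np.arange(N, dtype='int64'), unit='ms'),
                        B=to_datetime(np.zeros(N, dtype='int64'), unit='ms')))
    delta = df['A'] - df['B']
    assert delta.dtype == np.dtype('timedelta64[ns]')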
diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py
new file mode 100644
index 0000000000000..9eee932de8b7c
--- /dev/null
+++ b/asv_bench/benchmarks/io_bench.py
@@ -0,0 +1,135 @@
+from pandas_vb_common import *
+from pandas import concat, Timestamp
+from StringIO import StringIO
+
+
+class frame_to_csv(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(3000, 30))
+
+    def time_frame_to_csv(self):
+        self.df.to_csv('__test__.csv')
+
+
+class frame_to_csv2(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({'A': range(50000), })
+        self.df['B'] = (self.df.A + 1.0)
+        self.df['C'] = (self.df.A + 2.0)
+        self.df['D'] = (self.df.A + 3.0)
+
+    def time_frame_to_csv2(self):
+        self.df.to_csv('__test__.csv')
+
+
+class frame_to_csv_date_formatting(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=1000)
+        self.data = DataFrame(self.rng, index=self.rng)
+
+    def time_frame_to_csv_date_formatting(self):
+        self.data.to_csv('__test__.csv', date_format='%Y%m%d')
+
+
+class frame_to_csv_mixed(object):
+    goal_time = 0.2
+
+    def setup(self):
+
+        def create_cols(name):
+            return [('%s%03d' % (name, i)) for i in xrange(5)]
+        self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=create_cols('float'))
+        self.df_int = DataFrame(np.random.randn(5000, 5), dtype='int64', columns=create_cols('int'))
+        self.df_bool = DataFrame(True, index=self.df_float.index, columns=create_cols('bool'))
+        self.df_object = DataFrame('foo', index=self.df_float.index, columns=create_cols('object'))
+        self.df_dt = DataFrame(Timestamp('20010101'), index=self.df_float.index, columns=create_cols('date'))
+        self.df_float.ix[30:500, 1:3] = np.nan
+        self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1)
+
+    def time_frame_to_csv_mixed(self):
+        self.df.to_csv('__test__.csv')
+
+
+class read_csv_infer_datetime_format_custom(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=1000)
+        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%m/%d/%Y %H:%M:%S.%f'))))
+
+    def time_read_csv_infer_datetime_format_custom(self):
+        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
+
+
+class read_csv_infer_datetime_format_iso8601(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=1000)
+        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))))
+
+    def time_read_csv_infer_datetime_format_iso8601(self):
+        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
+
+
+class read_csv_infer_datetime_format_ymd(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=1000)
+        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y%m%d'))))
+
+    def time_read_csv_infer_datetime_format_ymd(self):
+        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
+
+
+class read_csv_skiprows(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(20000)
+        self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index)
+        self.df.to_csv('__test__.csv')
+
+    def time_read_csv_skiprows(self):
+        read_csv('__test__.csv', skiprows=10000)
+
+
+class read_csv_standard(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+        self.df.to_csv('__test__.csv')
+
+    def time_read_csv_standard(self):
+        read_csv('__test__.csv')
+
+
+class read_parse_dates_iso8601(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=1000)
+        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))))
+
+    def time_read_parse_dates_iso8601(self):
+        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'])
+
+
+class write_csv_standard(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+
+    def time_write_csv_standard(self):
+        self.df.to_csv('__test__.csv')
\ No newline at end of file
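Note: the three read_csv_infer_datetime_format_* benchmarks above differ only in the strftime format used to render the column, so together they measure how well infer_datetime_format=True copes with ISO 8601, compact ymd, and a custom format. The timed call, reconstructed on a small in-memory sample:

    from StringIO import StringIO  # Python 2, matching the imports above
    from pandas import date_range, read_csv

    rng = date_range('1/1/2000', periods=1000)
    data = '\n'.join(rng.map(lambda x: x.strftime('%Y-%m-%d %H:%M:%S')))
    df = read_csv(StringIO(data), header=None, names=['foo'],
                  parse_dates=['foo'], infer_datetime_format=True)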
diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py
new file mode 100644
index 0000000000000..e75e691b61c96
--- /dev/null
+++ b/asv_bench/benchmarks/io_sql.py
@@ -0,0 +1,215 @@
+from pandas_vb_common import *
+from sqlalchemy import create_engine
+import sqlite3
+import sqlalchemy
+
+
+class sql_datetime_read_and_parse_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df['datetime_string'] = self.df['datetime'].map(str)
+        self.df.to_sql('test_type', self.engine, if_exists='replace')
+        self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+    def time_sql_datetime_read_and_parse_sqlalchemy(self):
+        read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string'])
+
+
+class sql_datetime_read_as_native_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df['datetime_string'] = self.df['datetime'].map(str)
+        self.df.to_sql('test_type', self.engine, if_exists='replace')
+        self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+    def time_sql_datetime_read_as_native_sqlalchemy(self):
+        read_sql_table('test_type', self.engine, columns=['datetime'])
+
+
+class sql_datetime_write_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df.loc[1000:3000, 'float'] = np.nan
+
+    def time_sql_datetime_write_sqlalchemy(self):
+        self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace')
+
+
+class sql_float_read_query_fallback(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df['datetime_string'] = self.df['datetime'].map(str)
+        self.df.to_sql('test_type', self.engine, if_exists='replace')
+        self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+    def time_sql_float_read_query_fallback(self):
+        read_sql_query('SELECT float FROM test_type', self.con)
+
+
+class sql_float_read_query_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df['datetime_string'] = self.df['datetime'].map(str)
+        self.df.to_sql('test_type', self.engine, if_exists='replace')
+        self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+    def time_sql_float_read_query_sqlalchemy(self):
+        read_sql_query('SELECT float FROM test_type', self.engine)
+
+
+class sql_float_read_table_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df['datetime_string'] = self.df['datetime'].map(str)
+        self.df.to_sql('test_type', self.engine, if_exists='replace')
+        self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace')
+
+    def time_sql_float_read_table_sqlalchemy(self):
+        read_sql_table('test_type', self.engine, columns=['float'])
+
+
+class sql_float_write_fallback(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df.loc[1000:3000, 'float'] = np.nan
+
+    def time_sql_float_write_fallback(self):
+        self.df[['float']].to_sql('test_float', self.con, if_exists='replace')
+
+
+class sql_float_write_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df.loc[1000:3000, 'float'] = np.nan
+
+    def time_sql_float_write_sqlalchemy(self):
+        self.df[['float']].to_sql('test_float', self.engine, if_exists='replace')
+
+
+class sql_read_query_fallback(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+        self.df.to_sql('test2', self.engine, if_exists='replace')
+        self.df.to_sql('test2', self.con, if_exists='replace')
+
+    def time_sql_read_query_fallback(self):
+        read_sql_query('SELECT * FROM test2', self.con)
+
+
+class sql_read_query_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+        self.df.to_sql('test2', self.engine, if_exists='replace')
+        self.df.to_sql('test2', self.con, if_exists='replace')
+
+    def time_sql_read_query_sqlalchemy(self):
+        read_sql_query('SELECT * FROM test2', self.engine)
+
+
+class sql_read_table_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+        self.df.to_sql('test2', self.engine, if_exists='replace')
+        self.df.to_sql('test2', self.con, if_exists='replace')
+
+    def time_sql_read_table_sqlalchemy(self):
+        read_sql_table('test2', self.engine)
+
+
+class sql_string_write_fallback(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df.loc[1000:3000, 'float'] = np.nan
+
+    def time_sql_string_write_fallback(self):
+        self.df[['string']].to_sql('test_string', self.con, if_exists='replace')
+
+
+class sql_string_write_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), })
+        self.df.loc[1000:3000, 'float'] = np.nan
+
+    def time_sql_string_write_sqlalchemy(self):
+        self.df[['string']].to_sql('test_string', self.engine, if_exists='replace')
+
+
+class sql_write_fallback(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+
+    def time_sql_write_fallback(self):
+        self.df.to_sql('test1', self.con, if_exists='replace')
+
+
+class sql_write_sqlalchemy(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.engine = create_engine('sqlite:///:memory:')
+        self.con = sqlite3.connect(':memory:')
+        self.index = tm.makeStringIndex(10000)
+        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
+
+    def time_sql_write_sqlalchemy(self):
+        self.df.to_sql('test1', self.engine, if_exists='replace')
\ No newline at end of file
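Note: every io_sql benchmark above is written twice, once against a raw sqlite3 connection (the fallback path) and once against a SQLAlchemy engine, since to_sql/read_sql take different code paths for the two. The round trip being timed, in miniature:

    import sqlite3
    import numpy as np
    from pandas import DataFrame, read_sql_query

    con = sqlite3.connect(':memory:')
    df = DataFrame({'float': np.random.randn(100)})
    df.to_sql('test', con, if_exists='replace')  # fallback writer
    out = read_sql_query('SELECT * FROM test', con)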
+ goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) + self.empty = pd.DataFrame() + + def time_concat_empty_frames1(self): + concat([self.df, self.empty]) + + +class concat_empty_frames2(object): + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) + self.empty = pd.DataFrame() + + def time_concat_empty_frames2(self): + concat([self.empty, self.df]) + + +class concat_series_axis1(object): + goal_time = 0.2 + + def setup(self): + self.n = 1000 + self.indices = tm.makeStringIndex(1000) + self.s = Series(self.n, index=self.indices) + self.pieces = [self.s[i:(- i)] for i in range(1, 10)] + self.pieces = (self.pieces * 50) + + def time_concat_series_axis1(self): + concat(self.pieces, axis=1) + + +class concat_small_frames(object): + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame(randn(5, 4)) + + def time_concat_small_frames(self): + concat(([self.df] * 1000)) + + +class i8merge(object): + goal_time = 0.2 + + def setup(self): + (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) + self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) + self.left['left'] = self.left.sum(axis=1) + self.i = np.random.permutation(len(self.left)) + self.right = self.left.iloc[self.i].copy() + self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) + self.right.index = np.arange(len(self.right)) + self.right['right'] *= (-1) + + def time_i8merge(self): + merge(self.left, self.right, how='outer') + + +class join_dataframe_index_multi(object): + goal_time = 0.2 + + def setup(self): + self.level1 = tm.makeStringIndex(10).values + self.level2 = tm.makeStringIndex(1000).values + self.label1 = np.arange(10).repeat(1000) + self.label2 = np.tile(np.arange(1000), 10) + self.key1 = np.tile(self.level1.take(self.label1), 10) + self.key2 = np.tile(self.level2.take(self.label2), 10) + self.shuf = np.arange(100000) + random.shuffle(self.shuf) + try: + self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) + self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) + except: + pass + try: + self.DataFrame = DataMatrix + except: + pass + self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) + self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) + self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) + self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + + def time_join_dataframe_index_multi(self): + self.df.join(self.df_multi, on=['key1', 'key2']) + + +class join_dataframe_index_single_key_bigger(object): + goal_time = 0.2 + + def setup(self): + self.level1 = tm.makeStringIndex(10).values + self.level2 = tm.makeStringIndex(1000).values + self.label1 = np.arange(10).repeat(1000) + self.label2 = np.tile(np.arange(1000), 10) + self.key1 = np.tile(self.level1.take(self.label1), 10) + self.key2 = np.tile(self.level2.take(self.label2), 10) + self.shuf = np.arange(100000) + random.shuffle(self.shuf) + 
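+        # the try/except blocks below appear to keep this generated setup running
+        # on much older pandas (MultiIndex/DataMatrix era); failures are swallowed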
try: + self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) + self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) + except: + pass + try: + self.DataFrame = DataMatrix + except: + pass + self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) + self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) + self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) + self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + + def time_join_dataframe_index_single_key_bigger(self): + self.df.join(self.df_key2, on='key2') + + +class join_dataframe_index_single_key_bigger_sort(object): + goal_time = 0.2 + + def setup(self): + self.level1 = tm.makeStringIndex(10).values + self.level2 = tm.makeStringIndex(1000).values + self.label1 = np.arange(10).repeat(1000) + self.label2 = np.tile(np.arange(1000), 10) + self.key1 = np.tile(self.level1.take(self.label1), 10) + self.key2 = np.tile(self.level2.take(self.label2), 10) + self.shuf = np.arange(100000) + random.shuffle(self.shuf) + try: + self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) + self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) + except: + pass + try: + self.DataFrame = DataMatrix + except: + pass + self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) + self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) + self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) + self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + + def time_join_dataframe_index_single_key_bigger_sort(self): + self.df_shuf.join(self.df_key2, on='key2', sort=True) + + +class join_dataframe_index_single_key_small(object): + goal_time = 0.2 + + def setup(self): + self.level1 = tm.makeStringIndex(10).values + self.level2 = tm.makeStringIndex(1000).values + self.label1 = np.arange(10).repeat(1000) + self.label2 = np.tile(np.arange(1000), 10) + self.key1 = np.tile(self.level1.take(self.label1), 10) + self.key2 = np.tile(self.level2.take(self.label2), 10) + self.shuf = np.arange(100000) + random.shuffle(self.shuf) + try: + self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) + self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) + except: + pass + try: + self.DataFrame = DataMatrix + except: + pass + self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': 
np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) + self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) + self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) + self.df_shuf = self.df.reindex(self.df.index[self.shuf]) + + def time_join_dataframe_index_single_key_small(self): + self.df.join(self.df_key1, on='key1') + + +class join_dataframe_integer_2key(object): + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) + self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) + self.df3 = self.df[:5000] + + def time_join_dataframe_integer_2key(self): + merge(self.df, self.df3) + + +class join_dataframe_integer_key(object): + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) + self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) + self.df3 = self.df[:5000] + + def time_join_dataframe_integer_key(self): + merge(self.df, self.df2, on='key1') + + +class join_non_unique_equal(object): + goal_time = 0.2 + + def setup(self): + self.date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') + self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S') + self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray)) + self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0) + self.fracofday = TimeSeries(self.fracofday, self.daily_dates) + self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D') + self.temp = TimeSeries(1.0, self.index) + + def time_join_non_unique_equal(self): + (self.fracofday * self.temp[self.fracofday.index]) + + +class left_outer_join_index(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(2718281) + self.n = 50000 + self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) + self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') + + def time_left_outer_join_index(self): + self.left.join(self.right, on='jim') + + +class merge_2intkey_nosort(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.indices = tm.makeStringIndex(self.N).values + self.indices2 = tm.makeStringIndex(self.N).values + self.key = np.tile(self.indices[:8000], 10) + self.key2 = np.tile(self.indices2[:8000], 10) + self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) + self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + + def time_merge_2intkey_nosort(self): + merge(self.left, self.right, sort=False) + + +class merge_2intkey_sort(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.indices = tm.makeStringIndex(self.N).values + self.indices2 = tm.makeStringIndex(self.N).values + self.key = np.tile(self.indices[:8000], 10) + self.key2 = np.tile(self.indices2[:8000], 10) + self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) + self.right = pd.DataFrame({'key': 
self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), })
+
+    def time_merge_2intkey_sort(self):
+        merge(self.left, self.right, sort=True)
+
+
+class series_align_int64_index(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 1000000
+
+        def sample(values, k):
+            self.sampler = np.random.permutation(len(values))
+            return values.take(self.sampler[:k])
+        self.sz = 500000
+        self.rng = np.arange(0, 10000000000000, 10000000)
+        self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng)
+        self.idx1 = np.sort(sample(self.stamps, self.sz))
+        self.idx2 = np.sort(sample(self.stamps, self.sz))
+        self.ts1 = Series(np.random.randn(self.sz), self.idx1)
+        self.ts2 = Series(np.random.randn(self.sz), self.idx2)
+
+    def time_series_align_int64_index(self):
+        (self.ts1 + self.ts2)
+
+
+class series_align_left_monotonic(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 1000000
+
+        def sample(values, k):
+            self.sampler = np.random.permutation(len(values))
+            return values.take(self.sampler[:k])
+        self.sz = 500000
+        self.rng = np.arange(0, 10000000000000, 10000000)
+        self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng)
+        self.idx1 = np.sort(sample(self.stamps, self.sz))
+        self.idx2 = np.sort(sample(self.stamps, self.sz))
+        self.ts1 = Series(np.random.randn(self.sz), self.idx1)
+        self.ts2 = Series(np.random.randn(self.sz), self.idx2)
+
+    def time_series_align_left_monotonic(self):
+        self.ts1.align(self.ts2, join='left')
\ No newline at end of file
diff --git a/asv_bench/benchmarks/miscellaneous.py b/asv_bench/benchmarks/miscellaneous.py
new file mode 100644
index 0000000000000..b9c02c85fb096
--- /dev/null
+++ b/asv_bench/benchmarks/miscellaneous.py
@@ -0,0 +1,30 @@
+from pandas_vb_common import *
+from pandas.util.decorators import cache_readonly
+
+
+class match_strings(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.uniques = tm.makeStringIndex(1000).values
+        self.all = self.uniques.repeat(10)
+
+    def time_match_strings(self):
+        match(self.all, self.uniques)
+
+
+class misc_cache_readonly(object):
+    goal_time = 0.2
+
+    def setup(self):
+
+
+        class Foo:
+
+            @cache_readonly
+            def prop(self):
+                return 5
+        self.obj = Foo()
+
+    def time_misc_cache_readonly(self):
+        self.obj.prop
\ No newline at end of file
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
new file mode 100644
index 0000000000000..81fa7c2238d16
--- /dev/null
+++ b/asv_bench/benchmarks/packers.py
@@ -0,0 +1,857 @@
+from numpy.random import randint
+import pandas as pd
+from collections import OrderedDict
+from pandas.compat import BytesIO
+import sqlite3
+from pandas_vb_common import *
+import os
+from sqlalchemy import create_engine
+import numpy as np
+from random import randrange
+from pandas.core import common as com
+
+
+def remove(f):
+    # module-level helper: the teardown() methods below call remove(self.f),
+    # which otherwise raises NameError because the remove() functions
+    # redefined inside each setup() are local to that method.
+    try:
+        os.remove(f)
+    except OSError:
+        pass
+
+
+class packers_read_csv(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.f = '__test__.msg'
+
+        def remove(f):
+            try:
+                os.remove(self.f)
+            except:
+                pass
+        self.N = 100000
+        self.C = 5
+        self.index = date_range('20000101', periods=self.N, freq='H')
+        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+        self.N = 100000
+        self.C = 5
+        self.index = date_range('20000101', periods=self.N, freq='H')
+        self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
+        self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
+        remove(self.f)
+        self.df.to_csv(self.f)
+
+    def time_packers_read_csv(self):
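+        # round-trips the CSV written at the end of setup()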
pd.read_csv(self.f) + + +class packers_read_excel(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.bio = BytesIO() + self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') + self.df[:2000].to_excel(self.writer) + self.writer.save() + + def time_packers_read_excel(self): + self.bio.seek(0) + pd.read_excel(self.bio) + + +class packers_read_hdf_store(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df2.to_hdf(self.f, 'df') + + def time_packers_read_hdf_store(self): + pd.read_hdf(self.f, 'df') + + +class packers_read_hdf_table(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df2.to_hdf(self.f, 'df', format='table') + + def time_packers_read_hdf_table(self): + pd.read_hdf(self.f, 'df') + + +class packers_read_json(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df.to_json(self.f, orient='split') + self.df.index = np.arange(self.N) + + def time_packers_read_json(self): + pd.read_json(self.f, orient='split') + + +class packers_read_json_date_index(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + 
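+        # builds the usual df/df2 pair; this benchmark then writes df with its
+        # DatetimeIndex intact and times reading it back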
self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df.to_json(self.f, orient='split') + + def time_packers_read_json_date_index(self): + pd.read_json(self.f, orient='split') + + +class packers_read_pack(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df2.to_msgpack(self.f) + + def time_packers_read_pack(self): + pd.read_msgpack(self.f) + + +class packers_read_pickle(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df2.to_pickle(self.f) + + def time_packers_read_pickle(self): + pd.read_pickle(self.f) + + +class packers_read_sql(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.engine = create_engine('sqlite:///:memory:') + self.df2.to_sql('table', self.engine, if_exists='replace') + + def time_packers_read_sql(self): + pd.read_sql_table('table', self.engine) + + +class packers_read_stata(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in 
range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df.to_stata(self.f, {'index': 'tc', }) + + def time_packers_read_stata(self): + pd.read_stata(self.f) + + +class packers_read_stata_with_validation(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] + self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] + self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] + self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) + self.df.to_stata(self.f, {'index': 'tc', }) + + def time_packers_read_stata_with_validation(self): + pd.read_stata(self.f) + + +class packers_write_csv(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + + def time_packers_write_csv(self): + self.df.to_csv(self.f) + + def teardown(self): + remove(self.f) + + +class packers_write_excel_openpyxl(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.bio = BytesIO() + + def time_packers_write_excel_openpyxl(self): + self.bio.seek(0) + self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl') + self.df[:2000].to_excel(self.writer) + self.writer.save() + + +class packers_write_excel_xlsxwriter(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + 
self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.bio = BytesIO() + + def time_packers_write_excel_xlsxwriter(self): + self.bio.seek(0) + self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') + self.df[:2000].to_excel(self.writer) + self.writer.save() + + +class packers_write_excel_xlwt(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.bio = BytesIO() + + def time_packers_write_excel_xlwt(self): + self.bio.seek(0) + self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') + self.df[:2000].to_excel(self.writer) + self.writer.save() + + +class packers_write_hdf_store(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + + def time_packers_write_hdf_store(self): + self.df2.to_hdf(self.f, 'df') + + def teardown(self): + remove(self.f) + + +class packers_write_hdf_table(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + + def time_packers_write_hdf_table(self): + self.df2.to_hdf(self.f, 'df', table=True) + + def teardown(self): + remove(self.f) + + +class packers_write_json(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), 
index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df.index = np.arange(self.N) + + def time_packers_write_json(self): + self.df.to_json(self.f, orient='split') + + def teardown(self): + remove(self.f) + + +class packers_write_json_T(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df.index = np.arange(self.N) + + def time_packers_write_json_T(self): + self.df.to_json(self.f, orient='columns') + + def teardown(self): + remove(self.f) + + +class packers_write_json_date_index(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + + def time_packers_write_json_date_index(self): + self.df.to_json(self.f, orient='split') + + def teardown(self): + remove(self.f) + + +class packers_write_json_mixed_delta_int_tstamp(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))] + self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + + def time_packers_write_json_mixed_delta_int_tstamp(self): + self.df_mixed.to_json(self.f, orient='split') + + def teardown(self): + remove(self.f) + + +class packers_write_json_mixed_float_int(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = 
DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] + self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + + def time_packers_write_json_mixed_float_int(self): + self.df_mixed.to_json(self.f, orient='index') + + def teardown(self): + remove(self.f) + + +class packers_write_json_mixed_float_int_T(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] + self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + + def time_packers_write_json_mixed_float_int_T(self): + self.df_mixed.to_json(self.f, orient='columns') + + def teardown(self): + remove(self.f) + + +class packers_write_json_mixed_float_int_str(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] + self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + + def time_packers_write_json_mixed_float_int_str(self): + self.df_mixed.to_json(self.f, orient='split') + + def teardown(self): + remove(self.f) + + +class packers_write_pack(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, 
freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + + def time_packers_write_pack(self): + self.df2.to_msgpack(self.f) + + def teardown(self): + remove(self.f) + + +class packers_write_pickle(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + + def time_packers_write_pickle(self): + self.df2.to_pickle(self.f) + + def teardown(self): + remove(self.f) + + +class packers_write_sql(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.engine = create_engine('sqlite:///:memory:') + + def time_packers_write_sql(self): + self.df2.to_sql('table', self.engine, if_exists='replace') + + +class packers_write_stata(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df.to_stata(self.f, {'index': 'tc', }) + + def time_packers_write_stata(self): + self.df.to_stata(self.f, {'index': 'tc', }) + + def teardown(self): + remove(self.f) + + +class packers_write_stata_with_validation(object): + goal_time = 0.2 + + def setup(self): + self.f = '__test__.msg' + + def remove(f): + try: + os.remove(self.f) + except: + pass + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.N = 100000 + self.C = 5 + self.index = date_range('20000101', periods=self.N, freq='H') + self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + remove(self.f) + self.df['int8_'] = [randint(np.iinfo(np.int8).min, 
(np.iinfo(np.int8).max - 27)) for _ in range(self.N)] + self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] + self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] + self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) + self.df.to_stata(self.f, {'index': 'tc', }) + + def time_packers_write_stata_with_validation(self): + self.df.to_stata(self.f, {'index': 'tc', }) + + def teardown(self): + remove(self.f) \ No newline at end of file diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py new file mode 120000 index 0000000000000..6e2e449a4c00a --- /dev/null +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -0,0 +1 @@ +../../vb_suite/pandas_vb_common.py \ No newline at end of file diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py new file mode 100644 index 0000000000000..c755cb122a0bf --- /dev/null +++ b/asv_bench/benchmarks/panel_ctor.py @@ -0,0 +1,64 @@ +from pandas_vb_common import * + + +class panel_from_dict_all_different_indexes(object): + goal_time = 0.2 + + def setup(self): + self.data_frames = {} + self.start = datetime(1990, 1, 1) + self.end = datetime(2012, 1, 1) + for x in xrange(100): + self.end += timedelta(days=1) + self.dr = np.asarray(date_range(self.start, self.end)) + self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) + self.data_frames[x] = self.df + + def time_panel_from_dict_all_different_indexes(self): + Panel.from_dict(self.data_frames) + + +class panel_from_dict_equiv_indexes(object): + goal_time = 0.2 + + def setup(self): + self.data_frames = {} + for x in xrange(100): + self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1))) + self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) + self.data_frames[x] = self.df + + def time_panel_from_dict_equiv_indexes(self): + Panel.from_dict(self.data_frames) + + +class panel_from_dict_same_index(object): + goal_time = 0.2 + + def setup(self): + self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq=datetools.Day(1))) + self.data_frames = {} + for x in xrange(100): + self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) + self.data_frames[x] = self.df + + def time_panel_from_dict_same_index(self): + Panel.from_dict(self.data_frames) + + +class panel_from_dict_two_different_indexes(object): + goal_time = 0.2 + + def setup(self): + self.data_frames = {} + self.start = datetime(1990, 1, 1) + self.end = datetime(2012, 1, 1) + for x in xrange(100): + if (x == 50): + self.end += timedelta(days=1) + self.dr = np.asarray(date_range(self.start, self.end)) + self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) + self.data_frames[x] = self.df + + def time_panel_from_dict_two_different_indexes(self): + Panel.from_dict(self.data_frames) \ No newline at end of file diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py new file mode 100644 index 0000000000000..4145b68dca997 --- /dev/null +++ b/asv_bench/benchmarks/panel_methods.py @@ -0,0 +1,56 @@ +from pandas_vb_common import * + + +class panel_pct_change_items(object): + goal_time = 0.2 + + def 
setup(self): + self.index = date_range(start='2000', freq='D', periods=1000) + self.panel = Panel(np.random.randn(100, len(self.index), 1000)) + + def time_panel_pct_change_items(self): + self.panel.pct_change(1, axis='items') + + +class panel_pct_change_major(object): + goal_time = 0.2 + + def setup(self): + self.index = date_range(start='2000', freq='D', periods=1000) + self.panel = Panel(np.random.randn(100, len(self.index), 1000)) + + def time_panel_pct_change_major(self): + self.panel.pct_change(1, axis='major') + + +class panel_pct_change_minor(object): + goal_time = 0.2 + + def setup(self): + self.index = date_range(start='2000', freq='D', periods=1000) + self.panel = Panel(np.random.randn(100, len(self.index), 1000)) + + def time_panel_pct_change_minor(self): + self.panel.pct_change(1, axis='minor') + + +class panel_shift(object): + goal_time = 0.2 + + def setup(self): + self.index = date_range(start='2000', freq='D', periods=1000) + self.panel = Panel(np.random.randn(100, len(self.index), 1000)) + + def time_panel_shift(self): + self.panel.shift(1) + + +class panel_shift_minor(object): + goal_time = 0.2 + + def setup(self): + self.index = date_range(start='2000', freq='D', periods=1000) + self.panel = Panel(np.random.randn(100, len(self.index), 1000)) + + def time_panel_shift_minor(self): + self.panel.shift(1, axis='minor') \ No newline at end of file diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py new file mode 100644 index 0000000000000..46167dc2bb33c --- /dev/null +++ b/asv_bench/benchmarks/parser_vb.py @@ -0,0 +1,109 @@ +from cStringIO import StringIO +from pandas_vb_common import * +import os +from pandas import read_csv, read_table + + +class read_csv_comment2(object): + goal_time = 0.2 + + def setup(self): + self.data = ['A,B,C'] + self.data = (self.data + (['1,2,3 # comment'] * 100000)) + self.data = '\n'.join(self.data) + + def time_read_csv_comment2(self): + read_csv(StringIO(self.data), comment='#') + + +class read_csv_default_converter(object): + goal_time = 0.2 + + def setup(self): + self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) + + +class read_csv_precise_converter(object): + goal_time = 0.2 + + def setup(self): + self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_precise_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, float_precision='high') + + +class read_csv_roundtrip_converter(object): + goal_time = 0.2 + + def setup(self): + 
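+        # five rows of high-precision (~28 significant digit) floats, tiled 200x;
+        # exercises the float_precision='round_trip' parser below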
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_roundtrip_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, float_precision='round_trip') + + +class read_csv_thou_vb(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 8 + self.format = (lambda x: '{:,}'.format(x)) + self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) + self.df = self.df.applymap(self.format) + self.df.to_csv('test.csv', sep='|') + + def time_read_csv_thou_vb(self): + read_csv('test.csv', sep='|', thousands=',') + + def teardown(self): + os.remove('test.csv') + + +class read_csv_vb(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 8 + self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) + self.df.to_csv('test.csv', sep='|') + + def time_read_csv_vb(self): + read_csv('test.csv', sep='|') + + def teardown(self): + os.remove('test.csv') + + +class read_table_multiple_date(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 8 + self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' + self.data = (self.data * 200) + + def time_read_table_multiple_date(self): + read_table(StringIO(self.data), sep=',', header=None, parse_dates=[[1, 2], [1, 3]]) + + +class read_table_multiple_date_baseline(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 8 + self.data = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' + self.data = (self.data * 200) + + def time_read_table_multiple_date_baseline(self): + read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) \ No newline at end of file diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py new file mode 100644 index 0000000000000..d1df1b429c656 --- /dev/null +++ b/asv_bench/benchmarks/plotting.py @@ -0,0 +1,19 @@ +from pandas_vb_common import * +try: + from pandas import date_range +except ImportError: + + def date_range(start=None, end=None, periods=None, freq=None): + return DatetimeIndex(start, end, periods=periods, offset=freq) + + +class plot_timeseries_period(object): + goal_time = 0.2 + + def setup(self): + self.N = 2000 + self.M = 5 + self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) + + def 
time_plot_timeseries_period(self): + self.df.plot() \ No newline at end of file diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py new file mode 100644 index 0000000000000..d6fbd0d31c389 --- /dev/null +++ b/asv_bench/benchmarks/reindex.py @@ -0,0 +1,384 @@ +from pandas_vb_common import * +from random import shuffle + + +class dataframe_reindex(object): + goal_time = 0.2 + + def setup(self): + self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute()) + self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, columns=range(10)) + self.df['foo'] = 'bar' + self.rng2 = Index(self.rng[::2]) + + def time_dataframe_reindex(self): + self.df.reindex(self.rng2) + + +class frame_drop_dup_inplace(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) + + def time_frame_drop_dup_inplace(self): + self.df.drop_duplicates(['key1', 'key2'], inplace=True) + + +class frame_drop_dup_na_inplace(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) + self.df.ix[:10000, :] = np.nan + + def time_frame_drop_dup_na_inplace(self): + self.df.drop_duplicates(['key1', 'key2'], inplace=True) + + +class frame_drop_duplicates(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) + + def time_frame_drop_duplicates(self): + self.df.drop_duplicates(['key1', 'key2']) + + +class frame_drop_duplicates_na(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) + self.df.ix[:10000, :] = np.nan + + def time_frame_drop_duplicates_na(self): + self.df.drop_duplicates(['key1', 'key2']) + + +class frame_fillna_many_columns_pad(object): + goal_time = 0.2 + + def setup(self): + self.values = np.random.randn(1000, 1000) + self.values[::2] = np.nan + self.df = DataFrame(self.values) + + def time_frame_fillna_many_columns_pad(self): + self.df.fillna(method='pad') + + +class frame_reindex_columns(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30)) + + def time_frame_reindex_columns(self): + self.df.reindex(columns=self.df.columns[1:5]) + + +class frame_sort_index_by_columns(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = 
DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+        self.col_array_list = list(self.df.values.T)
+
+    def time_frame_sort_index_by_columns(self):
+        self.df.sort_index(by=['key1', 'key2'])
+
+
+class lib_fast_zip(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.K = 10
+        self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+        self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+        self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+        self.col_array_list = list(self.df.values.T)
+
+    def time_lib_fast_zip(self):
+        lib.fast_zip(self.col_array_list)
+
+
+class lib_fast_zip_fillna(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 10000
+        self.K = 10
+        self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
+        self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
+        self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
+        self.col_array_list = list(self.df.values.T)
+        self.df.ix[:10000, :] = np.nan
+
+    def time_lib_fast_zip_fillna(self):
+        lib.fast_zip_fillna(self.col_array_list)
+
+
+class reindex_daterange_backfill(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+        self.ts2 = self.ts[::2]
+        self.ts3 = self.ts2.reindex(self.ts.index)
+        self.ts4 = self.ts3.astype('float32')
+
+        def pad(source_series, target_index):
+            try:
+                source_series.reindex(target_index, method='pad')
+            except:
+                source_series.reindex(target_index, fillMethod='pad')
+
+        def backfill(source_series, target_index):
+            try:
+                source_series.reindex(target_index, method='backfill')
+            except:
+                source_series.reindex(target_index, fillMethod='backfill')
+
+    def time_reindex_daterange_backfill(self):
+        # the pad/backfill helpers above are local to setup(), so the original
+        # bare backfill(...) call raised NameError; reindex directly instead
+        self.ts2.reindex(self.ts.index, method='backfill')
+
+
+class reindex_daterange_pad(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+        self.ts2 = self.ts[::2]
+        self.ts3 = self.ts2.reindex(self.ts.index)
+        self.ts4 = self.ts3.astype('float32')
+
+        def pad(source_series, target_index):
+            try:
+                source_series.reindex(target_index, method='pad')
+            except:
+                source_series.reindex(target_index, fillMethod='pad')
+
+        def backfill(source_series, target_index):
+            try:
+                source_series.reindex(target_index, method='backfill')
+            except:
+                source_series.reindex(target_index, fillMethod='backfill')
+
+    def time_reindex_daterange_pad(self):
+        self.ts2.reindex(self.ts.index, method='pad')
+
+
+class reindex_fillna_backfill(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
+        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+        self.ts2 = self.ts[::2]
+        self.ts3 = self.ts2.reindex(self.ts.index)
+        self.ts4 = self.ts3.astype('float32')
+
+        def pad(source_series, target_index):
+            try:
+                source_series.reindex(target_index, method='pad')
+            except:
+                source_series.reindex(target_index, fillMethod='pad')
+
+        def backfill(source_series, target_index):
+            try:
+                source_series.reindex(target_index, method='backfill')
+            except:
+                source_series.reindex(target_index, fillMethod='backfill')
+
+    def time_reindex_fillna_backfill(self):
+        self.ts3.fillna(method='backfill')
+
+
+class reindex_fillna_backfill_float32(object):
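+    # same as reindex_fillna_backfill but fills the float32 copy (self.ts4)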
goal_time = 0.2 + + def setup(self): + self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) + self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) + self.ts2 = self.ts[::2] + self.ts3 = self.ts2.reindex(self.ts.index) + self.ts4 = self.ts3.astype('float32') + + def pad(source_series, target_index): + try: + source_series.reindex(target_index, method='pad') + except: + source_series.reindex(target_index, fillMethod='pad') + + def backfill(source_series, target_index): + try: + source_series.reindex(target_index, method='backfill') + except: + source_series.reindex(target_index, fillMethod='backfill') + + def time_reindex_fillna_backfill_float32(self): + self.ts4.fillna(method='backfill') + + +class reindex_fillna_pad(object): + goal_time = 0.2 + + def setup(self): + self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) + self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) + self.ts2 = self.ts[::2] + self.ts3 = self.ts2.reindex(self.ts.index) + self.ts4 = self.ts3.astype('float32') + + def pad(source_series, target_index): + try: + source_series.reindex(target_index, method='pad') + except: + source_series.reindex(target_index, fillMethod='pad') + + def backfill(source_series, target_index): + try: + source_series.reindex(target_index, method='backfill') + except: + source_series.reindex(target_index, fillMethod='backfill') + + def time_reindex_fillna_pad(self): + self.ts3.fillna(method='pad') + + +class reindex_fillna_pad_float32(object): + goal_time = 0.2 + + def setup(self): + self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) + self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) + self.ts2 = self.ts[::2] + self.ts3 = self.ts2.reindex(self.ts.index) + self.ts4 = self.ts3.astype('float32') + + def pad(source_series, target_index): + try: + source_series.reindex(target_index, method='pad') + except: + source_series.reindex(target_index, fillMethod='pad') + + def backfill(source_series, target_index): + try: + source_series.reindex(target_index, method='backfill') + except: + source_series.reindex(target_index, fillMethod='backfill') + + def time_reindex_fillna_pad_float32(self): + self.ts4.fillna(method='pad') + + +class reindex_frame_level_align(object): + goal_time = 0.2 + + def setup(self): + self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + + def time_reindex_frame_level_align(self): + self.df.align(self.df_level, level=1, copy=False) + + +class reindex_frame_level_reindex(object): + goal_time = 0.2 + + def setup(self): + self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + + def time_reindex_frame_level_reindex(self): + self.df_level.reindex(self.df.index, level=1) + + +class reindex_multiindex(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000 + self.K = 20 + self.level1 = 
+        self.level2 = np.tile(tm.makeStringIndex(self.K).values, self.N)
+        self.index = MultiIndex.from_arrays([self.level1, self.level2])
+        self.s1 = Series(np.random.randn(self.N * self.K), index=self.index)
+        self.s2 = self.s1[::2]
+
+    def time_reindex_multiindex(self):
+        self.s1.reindex(self.s2.index)
+
+
+class series_align_irregular_string(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = 50000
+        self.indices = tm.makeStringIndex(self.n)
+
+        def sample(values, k):
+            # np.random.shuffle is spelled out so the helper does not rely on
+            # a bare shuffle() being star-imported into this namespace
+            self.sampler = np.arange(len(values))
+            np.random.shuffle(self.sampler)
+            return values.take(self.sampler[:k])
+        self.subsample_size = 40000
+        self.x = Series(np.random.randn(50000), self.indices)
+        self.y = Series(np.random.randn(self.subsample_size), index=sample(self.indices, self.subsample_size))
+
+    def time_series_align_irregular_string(self):
+        (self.x + self.y)
+
+
+class series_drop_duplicates_int(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.randint(0, 1000, size=10000))
+        self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
+
+    def time_series_drop_duplicates_int(self):
+        self.s.drop_duplicates()
+
+
+class series_drop_duplicates_string(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(np.random.randint(0, 1000, size=10000))
+        self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
+
+    def time_series_drop_duplicates_string(self):
+        self.s2.drop_duplicates()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
new file mode 100644
index 0000000000000..9b78c287c5ad4
--- /dev/null
+++ b/asv_bench/benchmarks/replace.py
@@ -0,0 +1,48 @@
+from pandas_vb_common import *
+from pandas.compat import range
+from datetime import timedelta
+
+
+class replace_fillna(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        try:
+            self.rng = date_range('1/1/2000', periods=self.N, freq='min')
+        except NameError:
+            self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute())
+            self.date_range = DateRange
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+    def time_replace_fillna(self):
+        self.ts.fillna(0.0, inplace=True)
+
+
+class replace_large_dict(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.n = (10 ** 6)
+        self.start_value = (10 ** 5)
+        self.to_rep = dict(((i, (self.start_value + i)) for i in range(self.n)))
+        self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
+
+    def time_replace_large_dict(self):
+        self.s.replace(self.to_rep, inplace=True)
+
+
+class replace_replacena(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 1000000
+        try:
+            self.rng = date_range('1/1/2000', periods=self.N, freq='min')
+        except NameError:
+            self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute())
+            self.date_range = DateRange
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+    def time_replace_replacena(self):
+        self.ts.replace(np.nan, 0.0, inplace=True)
\ No newline at end of file
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
new file mode 100644
index 0000000000000..b4081957af97b
--- /dev/null
+++ b/asv_bench/benchmarks/reshape.py
@@ -0,0 +1,76 @@
+from pandas_vb_common import *
+from pandas.core.reshape import melt
+
+
+class melt_dataframe(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C'])
+        self.df['id1'] = np.random.randint(0, 10, 10000)
+        self.df['id2'] = np.random.randint(100, 1000, 10000)
+
+    def time_melt_dataframe(self):
+        melt(self.df, id_vars=['id1', 'id2'])
+
+
+class reshape_pivot_time_series(object):
+    goal_time = 0.2
+
+    def setup(self):
+
+        def unpivot(frame):
+            (N, K) = frame.shape
+            self.data = {'value': frame.values.ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K)}
+            return DataFrame(self.data, columns=['date', 'variable', 'value'])
+        self.index = date_range('1/1/2000', periods=10000, freq='h')
+        self.df = DataFrame(randn(10000, 50), index=self.index, columns=range(50))
+        self.pdf = unpivot(self.df)
+        self.f = (lambda: self.pdf.pivot('date', 'variable', 'value'))
+
+    def time_reshape_pivot_time_series(self):
+        self.f()
+
+
+class reshape_stack_simple(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+        self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+        self.udf = self.df.unstack(1)
+
+    def time_reshape_stack_simple(self):
+        self.udf.stack()
+
+
+class reshape_unstack_simple(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)])
+        self.df = DataFrame(np.random.randn(10000, 4), index=self.index)
+
+    def time_reshape_unstack_simple(self):
+        self.df.unstack(1)
+
+
+class unstack_sparse_keyspace(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.NUM_ROWS = 1000
+        for iter in range(10):
+            self.df = DataFrame({'A': np.random.randint(50, size=self.NUM_ROWS), 'B': np.random.randint(50, size=self.NUM_ROWS), 'C': np.random.randint((-10), 10, size=self.NUM_ROWS), 'D': np.random.randint((-10), 10, size=self.NUM_ROWS), 'E': np.random.randint(10, size=self.NUM_ROWS), 'F': np.random.randn(self.NUM_ROWS)})
+            self.idf = self.df.set_index(['A', 'B', 'C', 'D', 'E'])
+            if (len(self.idf.index.unique()) == self.NUM_ROWS):
+                break
+
+    def time_unstack_sparse_keyspace(self):
+        self.idf.unstack()
\ No newline at end of file
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
new file mode 100644
index 0000000000000..9cd61c741dae1
--- /dev/null
+++ b/asv_bench/benchmarks/series_methods.py
@@ -0,0 +1,74 @@
+from pandas_vb_common import *
+
+
+class series_isin_int64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s1 = Series(np.random.randn(10000))
+        self.s2 = Series(np.random.randint(1, 10, 10000))
+        self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+        self.values = [1, 2]
+        self.s4 = self.s3.astype('object')
+
+    def time_series_isin_int64(self):
+        self.s3.isin(self.values)
+
+
+class series_isin_object(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s1 = Series(np.random.randn(10000))
+        self.s2 = Series(np.random.randint(1, 10, 10000))
+        self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
+        self.values = [1, 2]
+        self.s4 = self.s3.astype('object')
+
+    def
time_series_isin_object(self): + self.s4.isin(self.values) + + +class series_nlargest1(object): + goal_time = 0.2 + + def setup(self): + self.s1 = Series(np.random.randn(10000)) + self.s2 = Series(np.random.randint(1, 10, 10000)) + self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') + self.values = [1, 2] + self.s4 = self.s3.astype('object') + + def time_series_nlargest1(self): + self.s1.nlargest(3, take_last=True) + self.s1.nlargest(3, take_last=False) + + +class series_nlargest2(object): + goal_time = 0.2 + + def setup(self): + self.s1 = Series(np.random.randn(10000)) + self.s2 = Series(np.random.randint(1, 10, 10000)) + self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') + self.values = [1, 2] + self.s4 = self.s3.astype('object') + + def time_series_nlargest2(self): + self.s2.nlargest(3, take_last=True) + self.s2.nlargest(3, take_last=False) + + +class series_nsmallest2(object): + goal_time = 0.2 + + def setup(self): + self.s1 = Series(np.random.randn(10000)) + self.s2 = Series(np.random.randint(1, 10, 10000)) + self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') + self.values = [1, 2] + self.s4 = self.s3.astype('object') + + def time_series_nsmallest2(self): + self.s2.nsmallest(3, take_last=True) + self.s2.nsmallest(3, take_last=False) \ No newline at end of file diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py new file mode 100644 index 0000000000000..dbf35f5e40f55 --- /dev/null +++ b/asv_bench/benchmarks/sparse.py @@ -0,0 +1,55 @@ +from pandas_vb_common import * +import scipy.sparse +import pandas.sparse.series +from pandas.core.sparse import SparseSeries, SparseDataFrame +from pandas.core.sparse import SparseDataFrame + + +class sparse_series_to_frame(object): + goal_time = 0.2 + + def setup(self): + self.K = 50 + self.N = 50000 + self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T')) + self.series = {} + for i in range(1, (self.K + 1)): + self.data = np.random.randn(self.N)[:(- i)] + self.this_rng = self.rng[:(- i)] + self.data[100:] = np.nan + self.series[i] = SparseSeries(self.data, index=self.this_rng) + + def time_sparse_series_to_frame(self): + SparseDataFrame(self.series) + + +class sparse_frame_constructor(object): + goal_time = 0.2 + + def time_sparse_frame_constructor(self): + SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) + + +class sparse_series_from_coo(object): + goal_time = 0.2 + + def setup(self): + self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) + + def time_sparse_series_from_coo(self): + self.ss = pandas.sparse.series.SparseSeries.from_coo(self.A) + + +class sparse_series_to_coo(object): + goal_time = 0.2 + + def setup(self): + self.s = pd.Series(([np.nan] * 10000)) + self.s[0] = 3.0 + self.s[100] = (-1.0) + self.s[999] = 12.1 + self.s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10))) + self.ss = self.s.to_sparse() + + def time_sparse_series_to_coo(self): + self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) \ No newline at end of file diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py new file mode 100644 index 0000000000000..98e2bbfce1a44 --- /dev/null +++ b/asv_bench/benchmarks/stat_ops.py @@ -0,0 +1,236 @@ +from pandas_vb_common import * + + +class stat_ops_frame_mean_float_axis_0(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, 
size=self.df.shape)) + + def time_stat_ops_frame_mean_float_axis_0(self): + self.df.mean() + + +class stat_ops_frame_mean_float_axis_1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_mean_float_axis_1(self): + self.df.mean(1) + + +class stat_ops_frame_mean_int_axis_0(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_mean_int_axis_0(self): + self.dfi.mean() + + +class stat_ops_frame_mean_int_axis_1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_mean_int_axis_1(self): + self.dfi.mean(1) + + +class stat_ops_frame_sum_float_axis_0(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_sum_float_axis_0(self): + self.df.sum() + + +class stat_ops_frame_sum_float_axis_1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_sum_float_axis_1(self): + self.df.sum(1) + + +class stat_ops_frame_sum_int_axis_0(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_sum_int_axis_0(self): + self.dfi.sum() + + +class stat_ops_frame_sum_int_axis_1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(100000, 4)) + self.dfi = DataFrame(np.random.randint(1000, size=self.df.shape)) + + def time_stat_ops_frame_sum_int_axis_1(self): + self.dfi.sum(1) + + +class stat_ops_level_frame_sum(object): + goal_time = 0.2 + + def setup(self): + self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + + def time_stat_ops_level_frame_sum(self): + self.df.sum(level=1) + + +class stat_ops_level_frame_sum_multiple(object): + goal_time = 0.2 + + def setup(self): + self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + + def time_stat_ops_level_frame_sum_multiple(self): + self.df.sum(level=[0, 1]) + + +class stat_ops_level_series_sum(object): + goal_time = 0.2 + + def setup(self): + self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), 
index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + + def time_stat_ops_level_series_sum(self): + self.df[1].sum(level=1) + + +class stat_ops_level_series_sum_multiple(object): + goal_time = 0.2 + + def setup(self): + self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + + def time_stat_ops_level_series_sum_multiple(self): + self.df[1].sum(level=[0, 1]) + + +class stat_ops_series_std(object): + goal_time = 0.2 + + def setup(self): + self.s = Series(np.random.randn(100000), index=np.arange(100000)) + self.s[::2] = np.nan + + def time_stat_ops_series_std(self): + self.s.std() + + +class stats_corr_spearman(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 30)) + + def time_stats_corr_spearman(self): + self.df.corr(method='spearman') + + +class stats_rank2d_axis0_average(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(5000, 50)) + + def time_stats_rank2d_axis0_average(self): + self.df.rank() + + +class stats_rank2d_axis1_average(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(5000, 50)) + + def time_stats_rank2d_axis1_average(self): + self.df.rank(1) + + +class stats_rank_average(object): + goal_time = 0.2 + + def setup(self): + self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) + self.s = Series(self.values) + + def time_stats_rank_average(self): + self.s.rank() + + +class stats_rank_average_int(object): + goal_time = 0.2 + + def setup(self): + self.values = np.random.randint(0, 100000, size=200000) + self.s = Series(self.values) + + def time_stats_rank_average_int(self): + self.s.rank() + + +class stats_rank_pct_average(object): + goal_time = 0.2 + + def setup(self): + self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) + self.s = Series(self.values) + + def time_stats_rank_pct_average(self): + self.s.rank(pct=True) + + +class stats_rank_pct_average_old(object): + goal_time = 0.2 + + def setup(self): + self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) + self.s = Series(self.values) + + def time_stats_rank_pct_average_old(self): + (self.s.rank() / len(self.s)) + + +class stats_rolling_mean(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randn(100000) + + def time_stats_rolling_mean(self): + rolling_mean(self.arr, 100) \ No newline at end of file diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py new file mode 100644 index 0000000000000..5adfbf4c2557d --- /dev/null +++ b/asv_bench/benchmarks/strings.py @@ -0,0 +1,393 @@ +from pandas_vb_common import * +import string +import itertools as IT +import pandas.util.testing as testing + + +class strings_cat(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, 
size=10000) + + def time_strings_cat(self): + self.many.str.cat(sep=',') + + +class strings_center(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_center(self): + self.many.str.center(100) + + +class strings_contains_few(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_contains_few(self): + self.few.str.contains('matchthis') + + +class strings_contains_few_noregex(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_contains_few_noregex(self): + self.few.str.contains('matchthis', regex=False) + + +class strings_contains_many(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_contains_many(self): + self.many.str.contains('matchthis') + + +class strings_contains_many_noregex(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_contains_many_noregex(self): + self.many.str.contains('matchthis', regex=False) + + +class strings_count(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_count(self): + self.many.str.count('matchthis') + + +class strings_encode_decode(object): + goal_time = 0.2 + + def setup(self): + self.ser = Series(testing.makeUnicodeIndex()) + + def time_strings_encode_decode(self): + self.ser.str.encode('utf-8').str.decode('utf-8') + + +class strings_endswith(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + 
self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_endswith(self): + self.many.str.endswith('matchthis') + + +class strings_extract(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_extract(self): + self.many.str.extract('(\\w*)matchthis(\\w*)') + + +class strings_findall(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_findall(self): + self.many.str.findall('[A-Z]+') + + +class strings_get(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_get(self): + self.many.str.get(0) + + +class strings_get_dummies(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + self.s = make_series(string.uppercase, strlen=10, size=10000).str.join('|') + + def time_strings_get_dummies(self): + self.s.str.get_dummies('|') + + +class strings_join_split(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_join_split(self): + self.many.str.join('--').str.split('--') + + +class strings_join_split_expand(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_join_split_expand(self): + self.many.str.join('--').str.split('--', expand=True) + + +class strings_len(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def 
time_strings_len(self): + self.many.str.len() + + +class strings_lower(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_lower(self): + self.many.str.lower() + + +class strings_lstrip(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_lstrip(self): + self.many.str.lstrip('matchthis') + + +class strings_match(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_match(self): + self.many.str.match('mat..this') + + +class strings_pad(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_pad(self): + self.many.str.pad(100, side='both') + + +class strings_repeat(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_repeat(self): + self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many)))) + + +class strings_replace(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_replace(self): + self.many.str.replace('(matchthis)', '\x01\x01') + + +class strings_rstrip(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_rstrip(self): + self.many.str.rstrip('matchthis') + + +class strings_slice(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return 
Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_slice(self): + self.many.str.slice(5, 15, 2) + + +class strings_startswith(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_startswith(self): + self.many.str.startswith('matchthis') + + +class strings_strip(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_strip(self): + self.many.str.strip('matchthis') + + +class strings_title(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_title(self): + self.many.str.title() + + +class strings_upper(object): + goal_time = 0.2 + + def setup(self): + + def make_series(letters, strlen, size): + return Series(np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))) + self.many = make_series(('matchthis' + string.uppercase), strlen=19, size=10000) + self.few = make_series(('matchthis' + (string.uppercase * 42)), strlen=19, size=10000) + + def time_strings_upper(self): + self.many.str.upper() \ No newline at end of file diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py new file mode 100644 index 0000000000000..36a0f98e3f5ef --- /dev/null +++ b/asv_bench/benchmarks/timedelta.py @@ -0,0 +1,34 @@ +from pandas_vb_common import * +from pandas import to_timedelta + + +class timedelta_convert_int(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randint(0, 1000, size=10000) + + def time_timedelta_convert_int(self): + to_timedelta(self.arr, unit='s') + + +class timedelta_convert_string(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randint(0, 1000, size=10000) + self.arr = ['{0} days'.format(i) for i in self.arr] + + def time_timedelta_convert_string(self): + to_timedelta(self.arr) + + +class timedelta_convert_string_seconds(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randint(0, 60, size=10000) + self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] + + def time_timedelta_convert_string_seconds(self): + to_timedelta(self.arr) \ No newline at end of file diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py new file mode 100644 index 0000000000000..266c198de1455 --- /dev/null +++ b/asv_bench/benchmarks/timeseries.py @@ -0,0 +1,1046 @@ +from pandas.tseries.converter import 
DatetimeConverter +import pandas as pd +from datetime import timedelta +import datetime as dt +from pandas_vb_common import * +from pandas.tseries.frequencies import infer_freq +import pandas.tseries.holiday +import numpy as np + + +class dataframe_resample_max_numpy(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + + def time_dataframe_resample_max_numpy(self): + self.df.resample('1s', how=np.max) + + +class dataframe_resample_max_string(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + + def time_dataframe_resample_max_string(self): + self.df.resample('1s', how='max') + + +class dataframe_resample_mean_numpy(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + + def time_dataframe_resample_mean_numpy(self): + self.df.resample('1s', how=np.mean) + + +class dataframe_resample_mean_string(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + + def time_dataframe_resample_mean_string(self): + self.df.resample('1s', how='mean') + + +class dataframe_resample_min_numpy(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + + def time_dataframe_resample_min_numpy(self): + self.df.resample('1s', how=np.min) + + +class dataframe_resample_min_string(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + + def time_dataframe_resample_min_string(self): + self.df.resample('1s', how='min') + + +class datetimeindex_add_offset(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + 
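+        # NOTE (editor, assumption): very old pandas exposed resampling as
+        # Series.convert; the guard below, repeated in every setup in this
+        # file, aliases it so these benchmarks also run on those versions.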
if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='1/1/2000', periods=10000, freq='T') + + def time_datetimeindex_add_offset(self): + (self.rng + timedelta(minutes=2)) + + +class datetimeindex_converter(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + + def time_datetimeindex_converter(self): + DatetimeConverter.convert(self.rng, None, None) + + +class datetimeindex_infer_dst(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') + self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') + self.index = self.index.append(self.dst_rng) + self.index = self.index.append(self.dst_rng) + self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S')) + + def time_datetimeindex_infer_dst(self): + self.index.tz_localize('US/Eastern', infer_dst=True) + + +class datetimeindex_normalize(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') + + def time_datetimeindex_normalize(self): + self.rng.normalize() + + +class datetimeindex_unique(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='1/1/2000', periods=1000, freq='T') + self.index = self.rng.repeat(10) + + def time_datetimeindex_unique(self): + self.index.unique() + + +class dti_reset_index(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='1/1/2000', periods=1000, freq='H') + self.df = DataFrame(np.random.randn(len(self.rng), 2), self.rng) + + def time_dti_reset_index(self): + self.df.reset_index() + + +class dti_reset_index_tz(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') + self.df = DataFrame(np.random.randn(len(self.rng), 2), index=self.rng) + + def time_dti_reset_index_tz(self): + self.df.reset_index() + + +class period_setitem(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample 
= Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.df = DataFrame(index=range(len(self.rng))) + + def time_period_setitem(self): + self.df['col'] = self.rng + + +class timeseries_1min_5min_mean(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + + def time_timeseries_1min_5min_mean(self): + self.ts[:10000].resample('5min', how='mean') + + +class timeseries_1min_5min_ohlc(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + + def time_timeseries_1min_5min_ohlc(self): + self.ts[:10000].resample('5min', how='ohlc') + + +class timeseries_add_irregular(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.lindex = np.random.permutation(self.N)[:(self.N // 2)] + self.rindex = np.random.permutation(self.N)[:(self.N // 2)] + self.left = Series(self.ts.values.take(self.lindex), index=self.ts.index.take(self.lindex)) + self.right = Series(self.ts.values.take(self.rindex), index=self.ts.index.take(self.rindex)) + + def time_timeseries_add_irregular(self): + (self.left + self.right) + + +class timeseries_asof(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.N = 10000 + self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + + def time_timeseries_asof(self): + self.ts.asof(self.dates) + + +class timeseries_asof_nan(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.N = 10000 + self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + self.ts[250:5000] = np.nan + + def time_timeseries_asof_nan(self): + self.ts.asof(self.dates) + + +class timeseries_asof_single(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.N = 10000 + self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + + def time_timeseries_asof_single(self): + self.ts.asof(self.dates[0]) + + +class 
timeseries_custom_bday_apply(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_apply(self): + self.cday.apply(self.date) + + +class timeseries_custom_bday_apply_dt64(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_apply_dt64(self): + self.cday.apply(self.dt64) + + +class timeseries_custom_bday_cal_decr(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_cal_decr(self): + (self.date - (1 * self.cdayh)) + + +class timeseries_custom_bday_cal_incr(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_cal_incr(self): + (self.date + (1 * self.cdayh)) + + +class timeseries_custom_bday_cal_incr_n(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') 
+ if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_cal_incr_n(self): + (self.date + (10 * self.cdayh)) + + +class timeseries_custom_bday_cal_incr_neg_n(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_cal_incr_neg_n(self): + (self.date - (10 * self.cdayh)) + + +class timeseries_custom_bday_decr(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_decr(self): + (self.date - self.cday) + + +class timeseries_custom_bday_incr(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bday_incr(self): + (self.date + self.cday) + + +class timeseries_custom_bmonthbegin_decr_n(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + 
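+        # NOTE (editor): the trailing 'Z' below parses as an explicit UTC
+        # timestamp; later numpy releases deprecate timezone-aware datetime64
+        # strings, so a naive '2011-01-01 09:00' may be safer on current numpy.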
self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bmonthbegin_decr_n(self): + (self.date - (10 * self.cmb)) + + +class timeseries_custom_bmonthbegin_incr_n(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bmonthbegin_incr_n(self): + (self.date + (10 * self.cmb)) + + +class timeseries_custom_bmonthend_decr_n(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bmonthend_decr_n(self): + (self.date - (10 * self.cme)) + + +class timeseries_custom_bmonthend_incr(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + self.year = pd.offsets.YearBegin() + self.cday = pd.offsets.CustomBusinessDay() + self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) + self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) + self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + + def time_timeseries_custom_bmonthend_incr(self): + (self.date + self.cme) + + +class timeseries_custom_bmonthend_incr_n(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.date = dt.datetime(2011, 1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() + self.day = pd.offsets.Day() + 
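+        # NOTE (editor): Day() and YearBegin() are fixed-rule offsets, whereas
+        # the CustomBusiness* offsets built next consult the holiday calendar
+        # on each apply; that cost difference is what the timeseries_custom_*
+        # benchmarks appear designed to measure.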
self.year = pd.offsets.YearBegin()
+        self.cday = pd.offsets.CustomBusinessDay()
+        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+        self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+    def time_timeseries_custom_bmonthend_incr_n(self):
+        (self.date + (10 * self.cme))
+
+
+class timeseries_day_apply(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.date = dt.datetime(2011, 1, 1)
+        self.dt64 = np.datetime64('2011-01-01 09:00Z')
+        self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+        self.day = pd.offsets.Day()
+        self.year = pd.offsets.YearBegin()
+        self.cday = pd.offsets.CustomBusinessDay()
+        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+        self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+    def time_timeseries_day_apply(self):
+        self.day.apply(self.date)
+
+
+class timeseries_day_incr(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.date = dt.datetime(2011, 1, 1)
+        self.dt64 = np.datetime64('2011-01-01 09:00Z')
+        self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+        self.day = pd.offsets.Day()
+        self.year = pd.offsets.YearBegin()
+        self.cday = pd.offsets.CustomBusinessDay()
+        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+        self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+    def time_timeseries_day_incr(self):
+        (self.date + self.day)
+
+
+class timeseries_infer_freq(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/1700', freq='D', periods=100000)
+        self.a = self.rng[:50000].append(self.rng[50002:])
+
+    def time_timeseries_infer_freq(self):
+        infer_freq(self.a)
+
+
+class timeseries_is_month_start(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.N = 10000
+        self.rng = date_range(start='1/1/1', periods=self.N, freq='B')
+
+    def time_timeseries_is_month_start(self):
+        self.rng.is_month_start
+
+
+class timeseries_iter_datetimeindex(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.N = 1000000
+        self.M = 10000
+        self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+        self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+    # iter_n is a method rather than a closure inside setup so that the timed
+    # method below can actually reach it (a closure would raise NameError)
+    def iter_n(self, iterable, n=None):
+        self.i = 0
+        for _ in iterable:
+            self.i += 1
+            if (n is not None) and (self.i > n):
+                break
+
+    def time_timeseries_iter_datetimeindex(self):
+        self.iter_n(self.idx1)
+
+
+class timeseries_iter_datetimeindex_preexit(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.N = 1000000
+        self.M = 10000
+        self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+        self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+    def iter_n(self, iterable, n=None):
+        self.i = 0
+        for _ in iterable:
+            self.i += 1
+            if (n is not None) and (self.i > n):
+                break
+
+    def time_timeseries_iter_datetimeindex_preexit(self):
+        self.iter_n(self.idx1, self.M)
+
+
+class timeseries_iter_periodindex(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.N = 1000000
+        self.M = 10000
+        self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+        self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+    def iter_n(self, iterable, n=None):
+        self.i = 0
+        for _ in iterable:
+            self.i += 1
+            if (n is not None) and (self.i > n):
+                break
+
+    def time_timeseries_iter_periodindex(self):
+        self.iter_n(self.idx2)
+
+
+class timeseries_iter_periodindex_preexit(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.N = 1000000
+        self.M = 10000
+        self.idx1 = date_range(start='20140101', freq='T', periods=self.N)
+        self.idx2 = period_range(start='20140101', freq='T', periods=self.N)
+
+    def iter_n(self, iterable, n=None):
+        self.i = 0
+        for _ in iterable:
+            self.i += 1
+            if (n is not None) and (self.i > n):
+                break
+
+    def time_timeseries_iter_periodindex_preexit(self):
+        self.iter_n(self.idx2, self.M)
+
+
+class timeseries_large_lookup_value(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/2000', periods=1500000, freq='S')
+        self.ts = Series(1, index=self.rng)
+
+    def time_timeseries_large_lookup_value(self):
+        self.ts[self.ts.index[(len(self.ts) // 2)]]
+        self.ts.index._cleanup()
+
+
+class timeseries_period_downsample_mean(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = period_range(start='1/1/2000', end='1/1/2001', freq='T')
+        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+    def time_timeseries_period_downsample_mean(self):
+        self.ts.resample('D', how='mean')
+
+
+class timeseries_resample_datetime64(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng =
+        self.rng = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U')
+        self.int_ts = Series(5, self.rng, dtype='int64')
+        self.ts = self.int_ts.astype('datetime64[ns]')
+
+    def time_timeseries_resample_datetime64(self):
+        self.ts.resample('1S', how='last')
+
+
+class timeseries_slice_minutely(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+    def time_timeseries_slice_minutely(self):
+        self.ts[:10000]
+
+
+class timeseries_sort_index(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='s')
+        self.rng = self.rng.take(np.random.permutation(self.N))
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+
+    def time_timeseries_sort_index(self):
+        self.ts.sort_index()
+
+
+class timeseries_timestamp_downsample_mean(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/2000', end='1/1/2001', freq='T')
+        self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
+
+    def time_timeseries_timestamp_downsample_mean(self):
+        self.ts.resample('D', how='mean')
+
+
+class timeseries_timestamp_tzinfo_cons(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern')
+
+    def time_timeseries_timestamp_tzinfo_cons(self):
+        self.rng[0]
+
+
+class timeseries_to_datetime_YYYYMMDD(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
+        self.strings = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
+
+    def time_timeseries_to_datetime_YYYYMMDD(self):
+        to_datetime(self.strings, format='%Y%m%d')
+
+
+class timeseries_to_datetime_iso8601(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
+        self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
+
+    def time_timeseries_to_datetime_iso8601(self):
+        to_datetime(self.strings)
+
+
+class timeseries_to_datetime_iso8601_format(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
+        self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
+
+    def time_timeseries_to_datetime_iso8601_format(self):
+        to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')
+
+
+class timeseries_with_format_no_exact(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
+
+    def time_timeseries_with_format_no_exact(self):
+        to_datetime(self.s, format='%d%b%y', exact=False)
+
+
+class timeseries_with_format_replace(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
+
+    def time_timeseries_with_format_replace(self):
+        to_datetime(self.s.str.replace(':\\S+$', ''), format='%d%b%y')
+
+
+class timeseries_year_apply(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.date = dt.datetime(2011, 1, 1)
+        self.dt64 = np.datetime64('2011-01-01 09:00Z')
+        self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+        self.day = pd.offsets.Day()
+        self.year = pd.offsets.YearBegin()
+        self.cday = pd.offsets.CustomBusinessDay()
+        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+        self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+    def time_timeseries_year_apply(self):
+        self.year.apply(self.date)
+
+
+class timeseries_year_incr(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
+        if hasattr(Series, 'convert'):
+            Series.resample = Series.convert
+        self.ts = Series(np.random.randn(self.N), index=self.rng)
+        self.date = dt.datetime(2011, 1, 1)
+        self.dt64 = np.datetime64('2011-01-01 09:00Z')
+        self.hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+        self.day = pd.offsets.Day()
+        self.year = pd.offsets.YearBegin()
+        self.cday = pd.offsets.CustomBusinessDay()
+        self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal)
+        self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal)
+        self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
+
+    def time_timeseries_year_incr(self):
+        (self.date + self.year)
\ No newline at end of file
diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py
new file mode 100644
index 0000000000000..b3980ffed1a57
--- /dev/null
+++ b/asv_bench/vbench_to_asv.py
@@ -0,0 +1,151 @@
+import ast
+import vbench
+import os
+import sys
+import astor
+import glob
+
+
+def vbench_to_asv_source(bench, kinds=None):
+    tab = ' ' * 4
+    if kinds is None:
+        kinds = ['time']
+
+    output = 'class {}(object):\n'.format(bench.name)
+    output += tab + 'goal_time = 0.2\n\n'
+
+    if bench.setup:
+        indented_setup = [tab * 2 + '{}\n'.format(x) for x in bench.setup.splitlines()]
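+        # setup source is indented two levels (class body + method body)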
+        output += tab + 'def setup(self):\n' + ''.join(indented_setup) + '\n'
+
+    for kind in kinds:
+        output += tab + 'def {}_{}(self):\n'.format(kind, bench.name)
+        for line in bench.code.splitlines():
+            output += tab * 2 + line + '\n'
+        output += '\n\n'
+
+    if bench.cleanup:
+        output += tab + 'def teardown(self):\n' + tab * 2 + bench.cleanup
+
+    output += '\n\n'
+    return output
+
+
+class AssignToSelf(ast.NodeTransformer):
+    def __init__(self):
+        super(AssignToSelf, self).__init__()
+        self.transforms = {}
+        self.imports = []
+
+        self.in_class_define = False
+        self.in_setup = False
+
+    def visit_ClassDef(self, node):
+        self.transforms = {}
+        self.in_class_define = True
+        self.generic_visit(node)
+        return node
+
+    def visit_TryExcept(self, node):
+        if any([isinstance(x, (ast.Import, ast.ImportFrom)) for x in node.body]):
+            self.imports.append(node)
+        else:
+            self.generic_visit(node)
+        return node
+
+    def visit_Assign(self, node):
+        for target in node.targets:
+            if isinstance(target, ast.Name) and not isinstance(target.ctx, ast.Param) and not self.in_class_define:
+                self.transforms[target.id] = 'self.' + target.id
+        self.generic_visit(node)
+
+        return node
+
+    def visit_Name(self, node):
+        new_node = node
+        if node.id in self.transforms:
+            if not isinstance(node.ctx, ast.Param):
+                new_node = ast.Attribute(value=ast.Name(id='self', ctx=node.ctx), attr=node.id, ctx=node.ctx)
+
+        self.generic_visit(node)
+
+        return ast.copy_location(new_node, node)
+
+    def visit_Import(self, node):
+        self.imports.append(node)
+
+    def visit_ImportFrom(self, node):
+        self.imports.append(node)
+
+    def visit_FunctionDef(self, node):
+        """Delete functions that are empty due to imports being moved"""
+        self.in_class_define = False
+
+        if self.in_setup:
+            node.col_offset -= 4
+            ast.increment_lineno(node, -1)
+
+        if node.name == 'setup':
+            self.in_setup = True
+
+        self.generic_visit(node)
+
+        if node.name == 'setup':
+            self.in_setup = False
+
+        if node.body:
+            return node
+
+
+def translate_module(target_module):
+    g_vars = {}
+    l_vars = {}
+    exec('import ' + target_module) in g_vars
+
+    print target_module
+    module = eval(target_module, g_vars)
+
+    benchmarks = []
+    for obj_str in dir(module):
+        obj = getattr(module, obj_str)
+        if isinstance(obj, vbench.benchmark.Benchmark):
+            benchmarks.append(obj)
+
+    if not benchmarks:
+        return
+
+    rewritten_output = ''
+    for bench in benchmarks:
+        rewritten_output += vbench_to_asv_source(bench)
+
+    with open('rewrite.py', 'w') as f:
+        f.write(rewritten_output)
+
+    ast_module = ast.parse(rewritten_output)
+
+    transformer = AssignToSelf()
+    transformed_module = transformer.visit(ast_module)
+
+    unique_imports = {astor.to_source(node): node for node in transformer.imports}
+
+    transformed_module.body = unique_imports.values() + transformed_module.body
+
+    transformed_source = astor.to_source(transformed_module)
+
+    with open('benchmarks/{}.py'.format(target_module), 'w') as f:
+        f.write(transformed_source)
+
+
+if __name__ == '__main__':
+    cwd = os.getcwd()
+    new_dir = os.path.join(os.path.dirname(__file__), '../vb_suite')
+    sys.path.insert(0, new_dir)
+
+    for module in glob.glob(os.path.join(new_dir, '*.py')):
+        mod = os.path.basename(module)
+        if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']:
+            continue
+        print
+        print mod
+
+        translate_module(mod.replace('.py', ''))
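+
+# Usage sketch: run from the asv_bench directory (assumes vbench and astor
+# are importable):
+#     $ python vbench_to_asv.py
+# Each eligible vb_suite module is translated into benchmarks/<module>.py,
+# with names assigned in setup code promoted to self.* attributes by the
+# AssignToSelf transformer above.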