From 89cfc2583fe7a4b6fe996343ac29e3253502d976 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Jul 2016 20:20:55 +0200 Subject: [PATCH 01/20] CLN/PERF: clean-up indexing benchmarks --- asv_bench/benchmarks/indexing.py | 190 ++++++------------------------- 1 file changed, 32 insertions(+), 158 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 094ae23a92fad..7704b9c6571b9 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -65,16 +65,6 @@ def setup(self): def time_frame_iloc_dups(self): self.df2.iloc[self.idx] - -class frame_loc_dups(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), }) - self.idx = (np.array(range(30)) * 99) - self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), }) - self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)]) - def time_frame_loc_dups(self): self.df2.loc[self.idx] @@ -105,18 +95,43 @@ def time_indexing_dataframe_boolean(self): class indexing_dataframe_boolean_no_ne(object): goal_time = 0.2 + params = [True, False] + param_names = ['use_numexpr'] + + def setup(self, use_numexpr): + if (expr is None): + raise NotImplementedError + self.df = DataFrame(np.random.randn(50000, 100)) + self.df2 = DataFrame(np.random.randn(50000, 100)) + if use_numexpr: + expr.set_numexpr_threads(1) + else: + expr.set_use_numexpr(False) + + def time_indexing_dataframe_boolean_no_ne(self, use_numexpr): + (self.df > self.df2) + + def teardown(self, use_numexpr): + if use_numexpr: + expr.set_numexpr_threads() + else: + expr.set_use_numexpr(True) + +class indexing_dataframe_boolean_st(object): + goal_time = 0.2 + def setup(self): if (expr is None): raise NotImplementedError self.df = DataFrame(np.random.randn(50000, 100)) self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_use_numexpr(False) + expr.set_numexpr_threads(1) - def time_indexing_dataframe_boolean_no_ne(self): + def time_indexing_dataframe_boolean_st(self): (self.df > self.df2) def teardown(self): - expr.set_use_numexpr(True) + expr.set_numexpr_threads() class indexing_dataframe_boolean_rows(object): @@ -130,34 +145,10 @@ def setup(self): def time_indexing_dataframe_boolean_rows(self): self.df[self.indexer] - -class indexing_dataframe_boolean_rows_object(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df['B'] > 0) - self.obj_indexer = self.indexer.astype('O') - def time_indexing_dataframe_boolean_rows_object(self): self.df[self.obj_indexer] -class indexing_dataframe_boolean_st(object): - goal_time = 0.2 - - def setup(self): - if (expr is None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_numexpr_threads(1) - - def time_indexing_dataframe_boolean_st(self): - (self.df > self.df2) - - def teardown(self): - expr.set_numexpr_threads() class indexing_frame_get_value(object): @@ -173,17 +164,6 @@ def setup(self): def time_indexing_frame_get_value(self): self.df.get_value(self.idx, self.col) - -class indexing_frame_get_value_ix(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - def time_indexing_frame_get_value_ix(self): self.df.ix[(self.idx, self.col)] @@ -236,6 +216,9 @@ def setup(self): def time_series_getitem_array(self): self.s[np.arange(10000)] + def time_series_getitem_list_like(self): + self.s[[800000]] + class series_getitem_label_slice(object): goal_time = 0.2 @@ -248,29 +231,11 @@ def setup(self): def time_series_getitem_label_slice(self): self.s[:self.lbl] - -class series_getitem_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_list_like(self): - self.s[[800000]] - - -class series_getitem_pos_slice(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - def time_series_getitem_pos_slice(self): self.s[:800000] -class series_getitem_scalar(object): +class SeriesIndexing(object): goal_time = 0.2 def setup(self): @@ -279,133 +244,42 @@ def setup(self): def time_series_getitem_scalar(self): self.s[800000] - -class series_getitem_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_getitem_slice(self): self.s[:800000] - -class series_iloc_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_iloc_array(self): self.s.iloc[np.arange(10000)] - -class series_iloc_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_iloc_list_like(self): self.s.iloc[[800000]] - -class series_iloc_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_iloc_scalar(self): self.s.iloc[800000] - -class series_iloc_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_iloc_slice(self): self.s.iloc[:800000] - -class series_ix_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_ix_array(self): self.s.ix[np.arange(10000)] - -class series_ix_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_ix_list_like(self): self.s.ix[[800000]] - -class series_ix_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_ix_scalar(self): self.s.ix[800000] - -class series_ix_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_ix_slice(self): self.s.ix[:800000] - -class series_loc_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_loc_array(self): self.s.loc[np.arange(10000)] - -class series_loc_list_like(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_loc_list_like(self): self.s.loc[[800000]] - -class series_loc_scalar(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_loc_scalar(self): self.s.loc[800000] - -class series_loc_slice(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - def time_series_loc_slice(self): self.s.loc[:800000] From b9e6665997498215357a0ef269836902093b57ef Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Jul 2016 12:34:02 +0200 Subject: [PATCH 02/20] Clean-up parser_vb benchmarks --- asv_bench/benchmarks/parser_vb.py | 168 +++++++++--------------------- 1 file changed, 52 insertions(+), 116 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 6dc8bffd6dac9..32bf7e50d1a89 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -1,71 +1,49 @@ from .pandas_vb_common import * import os -from pandas import read_csv, read_table +from pandas import read_csv try: from cStringIO import StringIO except ImportError: from io import StringIO -class read_csv_comment2(object): +class read_csv1(object): goal_time = 0.2 def setup(self): - self.data = ['A,B,C'] - self.data = (self.data + (['1,2,3 # comment'] * 100000)) - self.data = '\n'.join(self.data) - - def time_read_csv_comment2(self): - read_csv(StringIO(self.data), comment='#') - - -class read_csv_default_converter(object): - goal_time = 0.2 - - def setup(self): - self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n -0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n -0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n -0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n -0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" - self.data = (self.data * 200) - - def time_read_csv_default_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) + self.N = 10000 + self.K = 8 + self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) + self.df.to_csv('test.csv', sep='|') + self.format = (lambda x: '{:,}'.format(x)) + self.df2 = self.df.applymap(self.format) + self.df2.to_csv('test2.csv', sep='|') -class read_csv_default_converter_with_decimal(object): - goal_time = 0.2 + def time_sep(self): + read_csv('test.csv', sep='|') - def setup(self): - self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n -0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n -0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n -0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n -0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n""" - self.data = (self.data * 200) + def time_thousands(self): + read_csv('test.csv', sep='|', thousands=',') - def time_read_csv_default_converter_with_decimal(self): - read_csv(StringIO(self.data), sep=';', header=None, - float_precision=None, decimal=',') + def teardown(self): + os.remove('test.csv') + os.remove('test2.csv') -class read_csv_precise_converter(object): +class read_csv2(object): goal_time = 0.2 def setup(self): - self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n -0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n -0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n -0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n -0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" - self.data = (self.data * 200) + self.data = ['A,B,C'] + self.data = (self.data + (['1,2,3 # comment'] * 100000)) + self.data = '\n'.join(self.data) - def time_read_csv_precise_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision='high') + def time_comment(self): + read_csv(StringIO(self.data), comment='#') -class read_csv_roundtrip_converter(object): +class read_csv3(object): goal_time = 0.2 def setup(self): @@ -74,44 +52,33 @@ def setup(self): 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" + self.data2 = self.data.replace(',', ';').replace('.', ',') self.data = (self.data * 200) + self.data2 = (self.data2 * 200) - def time_read_csv_roundtrip_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, float_precision='round_trip') - - -class read_csv_thou_vb(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.format = (lambda x: '{:,}'.format(x)) - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) - self.df = self.df.applymap(self.format) - self.df.to_csv('test.csv', sep='|') - - def time_read_csv_thou_vb(self): - read_csv('test.csv', sep='|', thousands=',') - - def teardown(self): - os.remove('test.csv') + def time_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None) + def time_default_converter_with_decimal(self): + read_csv(StringIO(self.data2), sep=';', header=None, + float_precision=None, decimal=',') -class read_csv_vb(object): - goal_time = 0.2 + def time_default_converter_python_engine(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None, engine='python') - def setup(self): - self.N = 10000 - self.K = 8 - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) - self.df.to_csv('test.csv', sep='|') + def time_default_converter_with_decimal_python_engine(self): + read_csv(StringIO(self.data2), sep=';', header=None, + float_precision=None, decimal=',', engine='python') - def time_read_csv_vb(self): - read_csv('test.csv', sep='|') + def time_precise_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision='high') - def teardown(self): - os.remove('test.csv') + def time_roundtrip_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision='round_trip') class read_csv_categorical(object): @@ -125,17 +92,17 @@ def setup(self): 'c': np.random.choice(group1, N).astype('object')}) df.to_csv('strings.csv', index=False) - def time_read_csv_categorical_post(self): + def time_convert_post(self): read_csv('strings.csv').apply(pd.Categorical) - def time_read_csv_categorical_direct(self): + def time_convert_direct(self): read_csv('strings.csv', dtype='category') def teardown(self): os.remove('strings.csv') -class read_table_multiple_date(object): +class read_csv_dateparsing(object): goal_time = 0.2 def setup(self): @@ -143,43 +110,12 @@ def setup(self): self.K = 8 self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' self.data = (self.data * 200) + self.data2 = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' + self.data2 = (self.data2 * 200) - def time_read_table_multiple_date(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[[1, 2], [1, 3]]) - - -class read_table_multiple_date_baseline(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.data = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' - self.data = (self.data * 200) - - def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) - - -class read_csv_default_converter_python_engine(object): - goal_time = 0.2 - - def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_default_converter(self): + def time_multiple_date(self): read_csv(StringIO(self.data), sep=',', header=None, - float_precision=None, engine='python') - + parse_dates=[[1, 2], [1, 3]]) -class read_csv_default_converter_with_decimal_python_engine(object): - goal_time = 0.2 - - def setup(self): - self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' - self.data = (self.data * 200) - - def time_read_csv_default_converter_with_decimal(self): - read_csv(StringIO(self.data), sep=';', header=None, - float_precision=None, decimal=',', engine='python') + def time_baseline(self): + read_csv(StringIO(self.data2), sep=',', header=None, parse_dates=[1]) From 3e05108a5038fe043fc7e453c2d6320c1af2280b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Jul 2016 13:25:24 +0200 Subject: [PATCH 03/20] Clean-up packers benchmarks --- asv_bench/benchmarks/packers.py | 778 ++++---------------------------- 1 file changed, 95 insertions(+), 683 deletions(-) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 5419571c75b43..cd43e305ead8f 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -8,28 +8,19 @@ from sqlalchemy import create_engine import numpy as np from random import randrange -from pandas.core import common as com - -class packers_read_csv(object): +class _Packers(object): goal_time = 0.2 - def setup(self): + def _setup(self): self.f = '__test__.msg' self.N = 100000 self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) + self.df2 = self.df.copy() self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] self.remove(self.f) - self.df.to_csv(self.f) - - def time_packers_read_csv(self): - pd.read_csv(self.f) def remove(self, f): try: @@ -37,22 +28,21 @@ def remove(self, f): except: pass +class Packers(_Packers): + goal_time = 0.2 + + def setup(self): + self._setup() + self.df.to_csv(self.f) + + def time_packers_read_csv(self): + pd.read_csv(self.f) -class packers_read_excel(object): +class packers_read_excel(_Packers): goal_time = 0.2 def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.bio = BytesIO() self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') self.df[:2000].to_excel(self.writer) @@ -62,246 +52,94 @@ def time_packers_read_excel(self): self.bio.seek(0) pd.read_excel(self.bio) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_hdf_store(object): +class packers_read_hdf_store(_Packers): goal_time = 0.2 def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_hdf(self.f, 'df') def time_packers_read_hdf_store(self): pd.read_hdf(self.f, 'df') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_hdf_table(object): - goal_time = 0.2 +class packers_read_hdf_table(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_hdf(self.f, 'df', format='table') def time_packers_read_hdf_table(self): pd.read_hdf(self.f, 'df') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_json(object): - goal_time = 0.2 +class packers_read_json(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df.to_json(self.f, orient='split') self.df.index = np.arange(self.N) def time_packers_read_json(self): pd.read_json(self.f, orient='split') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_json_date_index(object): - goal_time = 0.2 +class packers_read_json_date_index(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] + self._setup() self.remove(self.f) self.df.to_json(self.f, orient='split') def time_packers_read_json_date_index(self): pd.read_json(self.f, orient='split') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_pack(object): - goal_time = 0.2 +class packers_read_pack(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_msgpack(self.f) def time_packers_read_pack(self): pd.read_msgpack(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_pickle(object): - goal_time = 0.2 +class packers_read_pickle(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df2.to_pickle(self.f) def time_packers_read_pickle(self): pd.read_pickle(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_read_sql(object): - goal_time = 0.2 +class packers_read_sql(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.engine = create_engine('sqlite:///:memory:') self.df2.to_sql('table', self.engine, if_exists='replace') def time_packers_read_sql(self): pd.read_sql_table('table', self.engine) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_stata(object): - goal_time = 0.2 +class packers_read_stata(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df.to_stata(self.f, {'index': 'tc', }) def time_packers_read_stata(self): pd.read_stata(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_stata_with_validation(object): - goal_time = 0.2 +class packers_read_stata_with_validation(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] @@ -311,594 +149,168 @@ def setup(self): def time_packers_read_stata_with_validation(self): pd.read_stata(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_read_sas7bdat(object): +class packers_read_sas(_Packers): def setup(self): self.f = os.path.join(os.path.dirname(__file__), '..', '..', 'pandas', 'io', 'tests', 'sas', 'data', 'test1.sas7bdat') + self.f2 = os.path.join(os.path.dirname(__file__), '..', '..', + 'pandas', 'io', 'tests', 'sas', 'data', + 'paxraw_d_short.xpt') - def time_packers_read_sas7bdat(self): + def time_read_sas7bdat(self): pd.read_sas(self.f, format='sas7bdat') - -class packers_read_xport(object): - - def setup(self): - self.f = os.path.join(os.path.dirname(__file__), '..', '..', - 'pandas', 'io', 'tests', 'sas', 'data', - 'paxraw_d_short.xpt') - - def time_packers_read_xport(self): + def time_read_xport(self): pd.read_sas(self.f, format='xport') -class packers_write_csv(object): - goal_time = 0.2 +class CSV(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() - def time_packers_write_csv(self): + def time_write_csv(self): self.df.to_csv(self.f) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_excel_openpyxl(object): - goal_time = 0.2 +class Excel(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.bio = BytesIO() - def time_packers_write_excel_openpyxl(self): + def time_write_excel_openpyxl(self): self.bio.seek(0) self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl') self.df[:2000].to_excel(self.writer) self.writer.save() - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_excel_xlsxwriter(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_xlsxwriter(self): + def time_write_excel_xlsxwriter(self): self.bio.seek(0) self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') self.df[:2000].to_excel(self.writer) self.writer.save() - def remove(self, f): - try: - os.remove(self.f) - except: - pass + def time_write_excel_xlwt(self): + self.bio.seek(0) + self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') + self.df[:2000].to_excel(self.writer) + self.writer.save() -class packers_write_excel_xlwt(object): - goal_time = 0.2 +class HDF(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.bio = BytesIO() - - def time_packers_write_excel_xlwt(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') - self.df[:2000].to_excel(self.writer) - self.writer.save() + self._setup() - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_hdf_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_hdf_store(self): + def time_write_hdf_store(self): self.df2.to_hdf(self.f, 'df') - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_hdf_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_hdf_table(self): + def time_write_hdf_table(self): self.df2.to_hdf(self.f, 'df', table=True) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json(object): - goal_time = 0.2 +class JSON(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() + self.df_date = self.df.copy() self.df.index = np.arange(self.N) - - def time_packers_write_json(self): - self.df.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_lines(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.remove(self.f) - self.df.index = np.arange(self.N) - - def time_packers_write_json_lines(self): - self.df.to_json(self.f, orient="records", lines=True) - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_T(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.index = np.arange(self.N) - - def time_packers_write_json_T(self): - self.df.to_json(self.f, orient='columns') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_date_index(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def time_packers_write_json_date_index(self): - self.df.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_delta_int_tstamp(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))] self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - def time_packers_write_json_mixed_delta_int_tstamp(self): - self.df_mixed.to_json(self.f, orient='split') - - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_float_int(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_packers_write_json_mixed_float_int(self): - self.df_mixed.to_json(self.f, orient='index') + self.df_mixed2 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_json_mixed_float_int_T(object): - goal_time = 0.2 + self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] + self.df_mixed3 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + def time_write_json(self): + self.df.to_json(self.f, orient='split') - def time_packers_write_json_mixed_float_int_T(self): - self.df_mixed.to_json(self.f, orient='columns') + def time_write_json_T(self): + self.df.to_json(self.f, orient='columns') - def teardown(self): - self.remove(self.f) + def time_write_json_date_index(self): + self.df_date.to_json(self.f, orient='split') - def remove(self, f): - try: - os.remove(self.f) - except: - pass + def time_write_json_mixed_delta_int_tstamp(self): + self.df_mixed.to_json(self.f, orient='split') + def time_write_json_mixed_float_int(self): + self.df_mixed2.to_json(self.f, orient='index') -class packers_write_json_mixed_float_int_str(object): - goal_time = 0.2 + def time_write_json_mixed_float_int_T(self): + self.df_mixed2.to_json(self.f, orient='columns') - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) + def time_write_json_mixed_float_int_str(self): + self.df_mixed3.to_json(self.f, orient='split') - def time_packers_write_json_mixed_float_int_str(self): - self.df_mixed.to_json(self.f, orient='split') + def time_write_json_lines(self): + self.df.to_json(self.f, orient="records", lines=True) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_pack(object): - goal_time = 0.2 +class MsgPack(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() - def time_packers_write_pack(self): + def time_write_msgpack(self): self.df2.to_msgpack(self.f) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_pickle(object): - goal_time = 0.2 +class Pickle(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() - def time_packers_write_pickle(self): + def time_write_pickle(self): self.df2.to_pickle(self.f) def teardown(self): self.remove(self.f) - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_sql(object): - goal_time = 0.2 +class SQL(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) + self._setup() self.engine = create_engine('sqlite:///:memory:') - def time_packers_write_sql(self): + def time_write_sql(self): self.df2.to_sql('table', self.engine, if_exists='replace') - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class packers_write_stata(object): - goal_time = 0.2 +class STATA(_Packers): def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_write_stata(self): - self.df.to_stata(self.f, {'index': 'tc', }) + self._setup() - def teardown(self): - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class packers_write_stata_with_validation(object): - goal_time = 0.2 + self.df3=self.df.copy() + self.df3['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] + self.df3['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] + self.df3['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] + self.df3['float32_'] = np.array(randn(self.N), dtype=np.float32) - def setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) + def time_write_stata(self): self.df.to_stata(self.f, {'index': 'tc', }) - def time_packers_write_stata_with_validation(self): - self.df.to_stata(self.f, {'index': 'tc', }) + def time_write_stata_with_validation(self): + self.df3.to_stata(self.f, {'index': 'tc', }) def teardown(self): self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass From f5ccecefe39f6cc17fdff4b7caad3353a17e0faa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Jul 2016 13:38:36 +0200 Subject: [PATCH 04/20] Clean-up sql benchmarks --- asv_bench/benchmarks/io_sql.py | 202 ++++++++------------------------- 1 file changed, 46 insertions(+), 156 deletions(-) diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py index c583ac1768c90..ec855e5d33525 100644 --- a/asv_bench/benchmarks/io_sql.py +++ b/asv_bench/benchmarks/io_sql.py @@ -4,121 +4,29 @@ from sqlalchemy import create_engine -class sql_datetime_read_and_parse_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') +#------------------------------------------------------------------------------- +# to_sql - def time_sql_datetime_read_and_parse_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) - - -class sql_datetime_read_as_native_sqlalchemy(object): +class WriteSQL(object): goal_time = 0.2 def setup(self): self.engine = create_engine('sqlite:///:memory:') self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_datetime_read_as_native_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime']) - - -class sql_datetime_write_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_datetime_write_sqlalchemy(self): - self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') - - -class sql_float_read_query_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_query_fallback(self): - read_sql_query('SELECT float FROM test_type', self.con) - - -class sql_float_read_query_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_query_sqlalchemy(self): - read_sql_query('SELECT float FROM test_type', self.engine) - - -class sql_float_read_table_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_sql_float_read_table_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['float']) - - -class sql_float_write_fallback(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_sql_float_write_fallback(self): - self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - + self.index = tm.makeStringIndex(10000) + self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) -class sql_float_write_sqlalchemy(object): - goal_time = 0.2 + def time_fallback(self): + self.df.to_sql('test1', self.con, if_exists='replace') - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan + def time_sqlalchemy(self): + self.df.to_sql('test1', self.engine, if_exists='replace') - def time_sql_float_write_sqlalchemy(self): - self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') +#------------------------------------------------------------------------------- +# read_sql -class sql_read_query_fallback(object): +class ReadSQL(object): goal_time = 0.2 def setup(self): @@ -129,41 +37,20 @@ def setup(self): self.df.to_sql('test2', self.engine, if_exists='replace') self.df.to_sql('test2', self.con, if_exists='replace') - def time_sql_read_query_fallback(self): + def time_read_query_fallback(self): read_sql_query('SELECT * FROM test2', self.con) - -class sql_read_query_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_query_sqlalchemy(self): + def time_read_query_sqlalchemy(self): read_sql_query('SELECT * FROM test2', self.engine) - -class sql_read_table_sqlalchemy(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_sql_read_table_sqlalchemy(self): + def time_read_table_sqlalchemy(self): read_sql_table('test2', self.engine) -class sql_string_write_fallback(object): +#------------------------------------------------------------------------------- +# type specific write + +class WriteSQLTypes(object): goal_time = 0.2 def setup(self): @@ -172,44 +59,47 @@ def setup(self): self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) self.df.loc[1000:3000, 'float'] = np.nan - def time_sql_string_write_fallback(self): + def time_string_fallback(self): self.df[['string']].to_sql('test_string', self.con, if_exists='replace') + def time_string_sqlalchemy(self): + self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') -class sql_string_write_sqlalchemy(object): - goal_time = 0.2 + def time_float_fallback(self): + self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan + def time_float_sqlalchemy(self): + self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') - def time_sql_string_write_sqlalchemy(self): - self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') + def time_datetime_sqlalchemy(self): + self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') -class sql_write_fallback(object): +#------------------------------------------------------------------------------- +# type specific read + +class ReadSQLTypes(object): goal_time = 0.2 def setup(self): self.engine = create_engine('sqlite:///:memory:') self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) + self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) + self.df['datetime_string'] = self.df['datetime'].map(str) + self.df.to_sql('test_type', self.engine, if_exists='replace') + self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - def time_sql_write_fallback(self): - self.df.to_sql('test1', self.con, if_exists='replace') + def time_datetime_read_and_parse_sqlalchemy(self): + read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) + def time_datetime_read_as_native_sqlalchemy(self): + read_sql_table('test_type', self.engine, columns=['datetime']) -class sql_write_sqlalchemy(object): - goal_time = 0.2 + def time_float_read_query_fallback(self): + read_sql_query('SELECT float FROM test_type', self.con) - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) + def time_float_read_query_sqlalchemy(self): + read_sql_query('SELECT float FROM test_type', self.engine) - def time_sql_write_sqlalchemy(self): - self.df.to_sql('test1', self.engine, if_exists='replace') + def time_float_read_table_sqlalchemy(self): + read_sql_table('test_type', self.engine, columns=['float']) From a12dbfd3952ed35c31673adca256df3437be81e0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Jul 2016 14:50:03 +0200 Subject: [PATCH 05/20] Clean-up join_merge benchmarks --- asv_bench/benchmarks/join_merge.py | 354 +++++++++++------------------ 1 file changed, 131 insertions(+), 223 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index c98179c8950c5..f146ebdc41558 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,33 +1,20 @@ from .pandas_vb_common import * +try: + from pandas import merge_ordered +except ImportError: + from pandas import ordered_merge as merge_ordered -class append_frame_single_homogenous(object): - goal_time = 0.2 - def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.df2 = self.df1.copy() - self.df2.index = np.arange(10000, 20000) - self.mdf1 = self.df1.copy() - self.mdf1['obj1'] = 'bar' - self.mdf1['obj2'] = 'bar' - self.mdf1['int1'] = 5 - try: - self.mdf1.consolidate(inplace=True) - except: - pass - self.mdf2 = self.mdf1.copy() - self.mdf2.index = self.df2.index - - def time_append_frame_single_homogenous(self): - self.df1.append(self.df2) +#---------------------------------------------------------------------- +# Append - -class append_frame_single_mixed(object): +class Append(object): goal_time = 0.2 def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) + self.df1 = pd.DataFrame(np.random.randn(10000, 4), + columns=['A', 'B', 'C', 'D']) self.df2 = self.df1.copy() self.df2.index = np.arange(10000, 20000) self.mdf1 = self.df1.copy() @@ -41,33 +28,17 @@ def setup(self): self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index - def time_append_frame_single_mixed(self): - self.mdf1.append(self.mdf2) - - -class concat_empty_frames1(object): - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() - - def time_concat_empty_frames1(self): - concat([self.df, self.empty]) - - -class concat_empty_frames2(object): - goal_time = 0.2 + def time_append_homogenous(self): + self.df1.append(self.df2) - def setup(self): - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() + def time_append_mixed(self): + self.mdf1.append(self.mdf2) - def time_concat_empty_frames2(self): - concat([self.empty, self.df]) +#---------------------------------------------------------------------- +# Concat -class concat_series_axis1(object): +class Concat(object): goal_time = 0.2 def setup(self): @@ -77,21 +48,26 @@ def setup(self): self.pieces = [self.s[i:(- i)] for i in range(1, 10)] self.pieces = (self.pieces * 50) + self.df_small = pd.DataFrame(randn(5, 4)) + + # empty + self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) + self.empty = pd.DataFrame() + def time_concat_series_axis1(self): concat(self.pieces, axis=1) + def time_concat_small_frames(self): + concat(([self.df_small] * 1000)) -class concat_small_frames(object): - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame(randn(5, 4)) + def time_concat_empty_frames1(self): + concat([self.df, self.empty]) - def time_concat_small_frames(self): - concat(([self.df] * 1000)) + def time_concat_empty_frames2(self): + concat([self.empty, self.df]) -class concat_panels(object): +class ConcatPanels(object): goal_time = 0.2 def setup(self): @@ -101,26 +77,26 @@ def setup(self): self.panels_c = [pd.Panel(np.copy(dataset, order='C')) for i in range(20)] - def time_concat_c_ordered_axis0(self): + def time_c_ordered_axis0(self): concat(self.panels_c, axis=0, ignore_index=True) - def time_concat_f_ordered_axis0(self): + def time_f_ordered_axis0(self): concat(self.panels_f, axis=0, ignore_index=True) - def time_concat_c_ordered_axis1(self): + def time_c_ordered_axis1(self): concat(self.panels_c, axis=1, ignore_index=True) - def time_concat_f_ordered_axis1(self): + def time_f_ordered_axis1(self): concat(self.panels_f, axis=1, ignore_index=True) - def time_concat_c_ordered_axis2(self): + def time_c_ordered_axis2(self): concat(self.panels_c, axis=2, ignore_index=True) - def time_concat_f_ordered_axis2(self): + def time_f_ordered_axis2(self): concat(self.panels_f, axis=2, ignore_index=True) -class concat_dataframes(object): +class ConcatFrames(object): goal_time = 0.2 def setup(self): @@ -131,37 +107,23 @@ def setup(self): self.frames_c = [pd.DataFrame(np.copy(dataset, order='C')) for i in range(20)] - def time_concat_c_ordered_axis0(self): + def time_c_ordered_axis0(self): concat(self.frames_c, axis=0, ignore_index=True) - def time_concat_f_ordered_axis0(self): + def time_f_ordered_axis0(self): concat(self.frames_f, axis=0, ignore_index=True) - def time_concat_c_ordered_axis1(self): + def time_c_ordered_axis1(self): concat(self.frames_c, axis=1, ignore_index=True) - def time_concat_f_ordered_axis1(self): + def time_f_ordered_axis1(self): concat(self.frames_f, axis=1, ignore_index=True) -class i8merge(object): - goal_time = 0.2 +#---------------------------------------------------------------------- +# Joins - def setup(self): - (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) - self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) - self.left['left'] = self.left.sum(axis=1) - self.i = np.random.permutation(len(self.left)) - self.right = self.left.iloc[self.i].copy() - self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) - self.right.index = np.arange(len(self.right)) - self.right['right'] *= (-1) - - def time_i8merge(self): - merge(self.left, self.right, how='outer') - - -class join_dataframe_index_multi(object): +class Join(object): goal_time = 0.2 def setup(self): @@ -174,123 +136,51 @@ def setup(self): self.shuf = np.arange(100000) random.shuffle(self.shuf) try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) + self.index2 = MultiIndex(levels=[self.level1, self.level2], + labels=[self.label1, self.label2]) + self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) + self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), + index=self.index2, + columns=['A', 'B', 'C', 'D']) except: pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) + self.df = pd.DataFrame({'data1': np.random.randn(100000), + 'data2': np.random.randn(100000), + 'key1': self.key1, + 'key2': self.key2}) + self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), + index=self.level1, + columns=['A', 'B', 'C', 'D']) + self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), + index=self.level2, + columns=['A', 'B', 'C', 'D']) self.df_shuf = self.df.reindex(self.df.index[self.shuf]) def time_join_dataframe_index_multi(self): self.df.join(self.df_multi, on=['key1', 'key2']) - -class join_dataframe_index_single_key_bigger(object): - goal_time = 0.2 - - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) - def time_join_dataframe_index_single_key_bigger(self): self.df.join(self.df_key2, on='key2') - -class join_dataframe_index_single_key_bigger_sort(object): - goal_time = 0.2 - - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) - def time_join_dataframe_index_single_key_bigger_sort(self): self.df_shuf.join(self.df_key2, on='key2', sort=True) - -class join_dataframe_index_single_key_small(object): - goal_time = 0.2 - - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) - def time_join_dataframe_index_single_key_small(self): self.df.join(self.df_key1, on='key1') -class join_dataframe_integer_2key(object): +class JoinIndex(object): goal_time = 0.2 def setup(self): - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) - self.df3 = self.df[:5000] - - def time_join_dataframe_integer_2key(self): - merge(self.df, self.df3) - - -class join_dataframe_integer_key(object): - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), 'key2': np.tile(np.arange(250).repeat(10), 4), 'value': np.random.randn(10000), }) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500), }) - self.df3 = self.df[:5000] + np.random.seed(2718281) + self.n = 50000 + self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) + self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') - def time_join_dataframe_integer_key(self): - merge(self.df, self.df2, on='key1') + def time_left_outer_join_index(self): + self.left.join(self.right, on='jim') class merge_asof_noby(object): @@ -350,6 +240,9 @@ def time_merge_asof_by_int(self): class join_non_unique_equal(object): + # outer join of non-unique + # GH 6329 + goal_time = 0.2 def setup(self): @@ -357,28 +250,18 @@ def setup(self): self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S') self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray)) self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0) - self.fracofday = TimeSeries(self.fracofday, self.daily_dates) + self.fracofday = Series(self.fracofday, self.daily_dates) self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D') - self.temp = TimeSeries(1.0, self.index) + self.temp = Series(1.0, self.index) def time_join_non_unique_equal(self): (self.fracofday * self.temp[self.fracofday.index]) -class left_outer_join_index(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(2718281) - self.n = 50000 - self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) - self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') - - def time_left_outer_join_index(self): - self.left.join(self.right, on='jim') - +#---------------------------------------------------------------------- +# Merges -class merge_2intkey_nosort(object): +class Merge(object): goal_time = 0.2 def setup(self): @@ -387,51 +270,73 @@ def setup(self): self.indices2 = tm.makeStringIndex(self.N).values self.key = np.tile(self.indices[:8000], 10) self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) - self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, + 'value': np.random.randn(80000)}) + self.right = pd.DataFrame({'key': self.indices[2000:], + 'key2': self.indices2[2000:], + 'value2': np.random.randn(8000)}) + + self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), + 'key2': np.tile(np.arange(250).repeat(10), 4), + 'value': np.random.randn(10000)}) + self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500)}) + self.df3 = self.df[:5000] def time_merge_2intkey_nosort(self): merge(self.left, self.right, sort=False) + def time_merge_2intkey_sort(self): + merge(self.left, self.right, sort=True) + + def time_merge_dataframe_integer_2key(self): + merge(self.df, self.df3) + + def time_merge_dataframe_integer_key(self): + merge(self.df, self.df2, on='key1') -class merge_2intkey_sort(object): + +class i8merge(object): goal_time = 0.2 def setup(self): - self.N = 10000 - self.indices = tm.makeStringIndex(self.N).values - self.indices2 = tm.makeStringIndex(self.N).values - self.key = np.tile(self.indices[:8000], 10) - self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, 'value': np.random.randn(80000), }) - self.right = pd.DataFrame({'key': self.indices[2000:], 'key2': self.indices2[2000:], 'value2': np.random.randn(8000), }) + (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) + self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), + columns=list('ABCDEFG')) + self.left['left'] = self.left.sum(axis=1) + self.i = np.random.permutation(len(self.left)) + self.right = self.left.iloc[self.i].copy() + self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) + self.right.index = np.arange(len(self.right)) + self.right['right'] *= (-1) - def time_merge_2intkey_sort(self): - merge(self.left, self.right, sort=True) + def time_i8merge(self): + merge(self.left, self.right, how='outer') -class series_align_int64_index(object): - goal_time = 0.2 +#---------------------------------------------------------------------- +# Ordered merge + +class MergeOrdered(object): def setup(self): - self.n = 1000000 - self.sz = 500000 - self.rng = np.arange(0, 10000000000000, 10000000) - self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng) - self.idx1 = np.sort(self.sample(self.stamps, self.sz)) - self.idx2 = np.sort(self.sample(self.stamps, self.sz)) - self.ts1 = Series(np.random.randn(self.sz), self.idx1) - self.ts2 = Series(np.random.randn(self.sz), self.idx2) - def time_series_align_int64_index(self): - (self.ts1 + self.ts2) + groups = tm.makeStringIndex(10).values - def sample(self, values, k): - self.sampler = np.random.permutation(len(values)) - return values.take(self.sampler[:k]) + self.left = pd.DataFrame({'group': groups.repeat(5000), + 'key' : np.tile(np.arange(0, 10000, 2), 10), + 'lvalue': np.random.randn(50000)}) + self.right = pd.DataFrame({'key' : np.arange(10000), + 'rvalue' : np.random.randn(10000)}) -class series_align_left_monotonic(object): + def time_merge_ordered(self): + merge_ordered(self.left, self.right, on='key', left_by='group') + + +#---------------------------------------------------------------------- +# data alignment + +class Align(object): goal_time = 0.2 def setup(self): @@ -444,9 +349,12 @@ def setup(self): self.ts1 = Series(np.random.randn(self.sz), self.idx1) self.ts2 = Series(np.random.randn(self.sz), self.idx2) - def time_series_align_left_monotonic(self): - self.ts1.align(self.ts2, join='left') - def sample(self, values, k): self.sampler = np.random.permutation(len(values)) return values.take(self.sampler[:k]) + + def time_series_align_int64_index(self): + (self.ts1 + self.ts2) + + def time_series_align_left_monotonic(self): + self.ts1.align(self.ts2, join='left') From cc23fc93ed85a4a7dc2214b554df312c18ba2ed9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Jul 2016 21:58:12 +0200 Subject: [PATCH 06/20] Clean-up timeseries/timedelta benchmarks --- asv_bench/benchmarks/timedelta.py | 32 +- asv_bench/benchmarks/timeseries.py | 1219 ++++++---------------------- 2 files changed, 249 insertions(+), 1002 deletions(-) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 8470525dd01fa..78b6aab9fb5c8 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -2,36 +2,24 @@ from pandas import to_timedelta, Timestamp -class timedelta_convert_int(object): +class Timedelta(object): goal_time = 0.2 def setup(self): self.arr = np.random.randint(0, 1000, size=10000) + self.arr2 = ['{0} days'.format(i) for i in self.arr] - def time_timedelta_convert_int(self): - to_timedelta(self.arr, unit='s') - - -class timedelta_convert_string(object): - goal_time = 0.2 + self.arr3 = np.random.randint(0, 60, size=10000) + self.arr3 = ['00:00:{0:02d}'.format(i) for i in self.arr3] - def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) - self.arr = ['{0} days'.format(i) for i in self.arr] - - def time_timedelta_convert_string(self): - to_timedelta(self.arr) - - -class timedelta_convert_string_seconds(object): - goal_time = 0.2 + def time_convert_int(self): + to_timedelta(self.arr, unit='s') - def setup(self): - self.arr = np.random.randint(0, 60, size=10000) - self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] + def time_convert_string(self): + to_timedelta(self.arr2) - def time_timedelta_convert_string_seconds(self): - to_timedelta(self.arr) + def time_convert_string_seconds(self): + to_timedelta(self.arr3) class timedelta_convert_bad_parse(object): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 8c00924cb07ef..6e9ef4b10273c 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -10,295 +10,211 @@ from pandas.tseries.frequencies import infer_freq import numpy as np +if hasattr(Series, 'convert'): + Series.resample = Series.convert -class dataframe_resample_max_numpy(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - def time_dataframe_resample_max_numpy(self): - self.df.resample('1s', how=np.max) - - -class dataframe_resample_max_string(object): +class DatetimeIndex(object): goal_time = 0.2 def setup(self): self.N = 100000 self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - - def time_dataframe_resample_max_string(self): - self.df.resample('1s', how='max') - + self.delta_offset = pd.offsets.Day() + self.fast_offset = pd.offsets.DateOffset(months=2, days=2) + self.slow_offset = pd.offsets.BusinessDay() -class dataframe_resample_mean_numpy(object): - goal_time = 0.2 + self.rng2 = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + self.index_repeated = date_range(start='1/1/2000', periods=1000, freq='T').repeat(10) - def time_dataframe_resample_mean_numpy(self): - self.df.resample('1s', how=np.mean) + self.rng3 = date_range(start='1/1/2000', periods=1000, freq='H') + self.df = DataFrame(np.random.randn(len(self.rng3), 2), self.rng3) + self.rng4 = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') + self.df2 = DataFrame(np.random.randn(len(self.rng4), 2), index=self.rng4) -class dataframe_resample_mean_string(object): - goal_time = 0.2 + N = 100000 + self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) + self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, + tz='Asia/Tokyo').repeat(5) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + self.rng5 = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') - def time_dataframe_resample_mean_string(self): - self.df.resample('1s', how='mean') + self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') + self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') + self.index = self.index.append(self.dst_rng) + self.index = self.index.append(self.dst_rng) + self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S')) + self.N = 10000 + self.rng6 = date_range(start='1/1/1', periods=self.N, freq='B') -class dataframe_resample_min_numpy(object): - goal_time = 0.2 + self.rng7 = date_range(start='1/1/1700', freq='D', periods=100000) + self.a = self.rng7[:50000].append(self.rng7[50002:]) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + def time_add_timedelta(self): + (self.rng + timedelta(minutes=2)) - def time_dataframe_resample_min_numpy(self): - self.df.resample('1s', how=np.min) + def time_add_offset_delta(self): + (self.rng + self.delta_offset) + def time_add_offset_fast(self): + (self.rng + self.fast_offset) -class dataframe_resample_min_string(object): - goal_time = 0.2 + def time_add_offset_slow(self): + (self.rng + self.slow_offset) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) + def time_normalize(self): + self.rng2.normalize() - def time_dataframe_resample_min_string(self): - self.df.resample('1s', how='min') + def time_unique(self): + self.index_repeated.unique() + def time_reset_index(self): + self.df.reset_index() -class datetimeindex_add_offset(object): - goal_time = 0.2 + def time_reset_index_tz(self): + self.df2.reset_index() - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=10000, freq='T') + def time_dti_factorize(self): + self.dti.factorize() - def time_datetimeindex_add_offset(self): - (self.rng + timedelta(minutes=2)) + def time_dti_tz_factorize(self): + self.dti_tz.factorize() + def time_timestamp_tzinfo_cons(self): + self.rng5[0] -class datetimeindex_converter(object): - goal_time = 0.2 + def time_infer_dst(self): + self.index.tz_localize('US/Eastern', infer_dst=True) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + def time_timeseries_is_month_start(self): + self.rng6.is_month_start - def time_datetimeindex_converter(self): - DatetimeConverter.convert(self.rng, None, None) + def time_infer_freq(self): + infer_freq(self.a) -class datetimeindex_infer_dst(object): +class TimeDatetimeConverter(object): goal_time = 0.2 def setup(self): self.N = 100000 self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') - self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') - self.index = self.index.append(self.dst_rng) - self.index = self.index.append(self.dst_rng) - self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S')) - def time_datetimeindex_infer_dst(self): - self.index.tz_localize('US/Eastern', infer_dst=True) + def time_convert(self): + DatetimeConverter.convert(self.rng, None, None) -class datetimeindex_normalize(object): +class Iteration(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') - - def time_datetimeindex_normalize(self): - self.rng.normalize() - - -class datetimeindex_unique(object): - goal_time = 0.2 + self.N = 1000000 + self.M = 10000 + self.idx1 = date_range(start='20140101', freq='T', periods=self.N) + self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1000, freq='T') - self.index = self.rng.repeat(10) + def iter_n(self, iterable, n=None): + self.i = 0 + for _ in iterable: + self.i += 1 + if ((n is not None) and (self.i > n)): + break - def time_datetimeindex_unique(self): - self.index.unique() + def time_iter_datetimeindex(self): + self.iter_n(self.idx1) + def time_iter_datetimeindex_preexit(self): + self.iter_n(self.idx1, self.M) -class dti_reset_index(object): - goal_time = 0.2 + def time_iter_periodindex(self): + self.iter_n(self.idx2) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1000, freq='H') - self.df = DataFrame(np.random.randn(len(self.rng), 2), self.rng) + def time_iter_periodindex_preexit(self): + self.iter_n(self.idx2, self.M) - def time_dti_reset_index(self): - self.df.reset_index() +#---------------------------------------------------------------------- +# Resampling -class dti_reset_index_tz(object): +class ResampleDataFrame(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') - self.df = DataFrame(np.random.randn(len(self.rng), 2), index=self.rng) + self.rng = date_range(start='20130101', periods=100000, freq='50L') + self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - def time_dti_reset_index_tz(self): - self.df.reset_index() + def time_max_numpy(self): + self.df.resample('1s', how=np.max) + def time_max_string(self): + self.df.resample('1s', how='max') -class datetime_algorithm(object): - goal_time = 0.2 + def time_mean_numpy(self): + self.df.resample('1s', how=np.mean) - def setup(self): - N = 100000 - self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) - self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, - tz='Asia/Tokyo').repeat(5) + def time_mean_string(self): + self.df.resample('1s', how='mean') - def time_dti_factorize(self): - self.dti.factorize() + def time_min_numpy(self): + self.df.resample('1s', how=np.min) - def time_dti_tz_factorize(self): - self.dti_tz.factorize() + def time_min_string(self): + self.df.resample('1s', how='min') -class timeseries_1min_5min_mean(object): +class ResampleSeries(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_1min_5min_mean(self): - self.ts[:10000].resample('5min', how='mean') + self.rng1 = period_range(start='1/1/2000', end='1/1/2001', freq='T') + self.ts1 = Series(np.random.randn(len(self.rng1)), index=self.rng1) + self.rng2 = date_range(start='1/1/2000', end='1/1/2001', freq='T') + self.ts2 = Series(np.random.randn(len(self.rng2)), index=self.rng2) -class timeseries_1min_5min_ohlc(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng3 = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') + self.int_ts = Series(5, self.rng3, dtype='int64') + self.dt_ts = self.int_ts.astype('datetime64[ns]') - def time_timeseries_1min_5min_ohlc(self): - self.ts[:10000].resample('5min', how='ohlc') + def time_period_downsample_mean(self): + self.ts1.resample('D', how='mean') + def time_timestamp_downsample_mean(self): + self.ts2.resample('D', how='mean') -class timeseries_add_irregular(object): - goal_time = 0.2 + def time_resample_datetime64(self): + # GH 7754 + self.dt_ts.resample('1S', how='last') - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.lindex = np.random.permutation(self.N)[:(self.N // 2)] - self.rindex = np.random.permutation(self.N)[:(self.N // 2)] - self.left = Series(self.ts.values.take(self.lindex), index=self.ts.index.take(self.lindex)) - self.right = Series(self.ts.values.take(self.rindex), index=self.ts.index.take(self.rindex)) + def time_1min_5min_mean(self): + self.ts2[:10000].resample('5min', how='mean') - def time_timeseries_add_irregular(self): - (self.left + self.right) + def time_1min_5min_ohlc(self): + self.ts2[:10000].resample('5min', how='ohlc') -class timeseries_asof(object): +class AsOf(object): goal_time = 0.2 def setup(self): self.N = 10000 self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') self.ts = Series(np.random.randn(self.N), index=self.rng) + self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') self.ts2 = self.ts.copy() self.ts2[250:5000] = np.nan self.ts3 = self.ts.copy() self.ts3[-5000:] = np.nan # test speed of pre-computing NAs. - def time_asof_list(self): + def time_asof(self): self.ts.asof(self.dates) # should be roughly the same as above. - def time_asof_nan_list(self): + def time_asof_nan(self): self.ts2.asof(self.dates) # test speed of the code path for a scalar index @@ -318,7 +234,7 @@ def time_asof_nan_single(self): self.ts3.asof(self.dates[-1]) -class timeseries_dataframe_asof(object): +class AsOfDataFrame(object): goal_time = 0.2 def setup(self): @@ -333,11 +249,11 @@ def setup(self): self.ts3.iloc[-5000:] = np.nan # test speed of pre-computing NAs. - def time_asof_list(self): + def time_asof(self): self.ts.asof(self.dates) # should be roughly the same as above. - def time_asof_nan_list(self): + def time_asof_nan(self): self.ts2.asof(self.dates) # test speed of the code path for a scalar index @@ -356,107 +272,105 @@ def time_asof_single_early(self): self.ts.asof(self.dates[0] - dt.timedelta(10)) -class timeseries_custom_bday_apply(object): +class TimeSeries(object): goal_time = 0.2 def setup(self): self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert + self.rng = date_range(start='1/1/2000', periods=self.N, freq='s') + self.rng = self.rng.take(np.random.permutation(self.N)) self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - def time_timeseries_custom_bday_apply(self): - self.cday.apply(self.date) + self.rng2 = date_range(start='1/1/2000', periods=self.N, freq='T') + self.ts2 = Series(np.random.randn(self.N), index=self.rng2) + self.lindex = np.random.permutation(self.N)[:(self.N // 2)] + self.rindex = np.random.permutation(self.N)[:(self.N // 2)] + self.left = Series(self.ts2.values.take(self.lindex), index=self.ts2.index.take(self.lindex)) + self.right = Series(self.ts2.values.take(self.rindex), index=self.ts2.index.take(self.rindex)) -class timeseries_custom_bday_apply_dt64(object): - goal_time = 0.2 + self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S') + self.ts3 = Series(1, index=self.rng3) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + def time_sort_index(self): + self.ts.sort_index() - def time_timeseries_custom_bday_apply_dt64(self): - self.cday.apply(self.dt64) + def time_timeseries_slice_minutely(self): + self.ts2[:10000] + def time_add_irregular(self): + (self.left + self.right) + + def time_large_lookup_value(self): + self.ts3[self.ts3.index[(len(self.ts3) // 2)]] + self.ts3.index._cleanup() -class timeseries_custom_bday_cal_decr(object): + +class SeriesArithmetic(object): goal_time = 0.2 def setup(self): self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) + self.delta_offset = pd.offsets.Day() + self.fast_offset = pd.offsets.DateOffset(months=2, days=2) + self.slow_offset = pd.offsets.BusinessDay() + + def time_add_offset_delta(self): + (self.s + self.delta_offset) - def time_timeseries_custom_bday_cal_decr(self): - (self.date - (1 * self.cdayh)) + def time_add_offset_fast(self): + (self.s + self.fast_offset) + + def time_add_offset_slow(self): + (self.s + self.slow_offset) -class timeseries_custom_bday_cal_incr(object): +class ToDatetime(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + self.rng = date_range(start='1/1/2000', periods=10000, freq='D') + self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) + + self.rng = date_range(start='1/1/2000', periods=20000, freq='H') + self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] + self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] + self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' + for x in self.rng] + + self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) + self.s2 = self.s.str.replace(':\\S+$', '') - def time_timeseries_custom_bday_cal_incr(self): - (self.date + (1 * self.cdayh)) + def time_format_YYYYMMDD(self): + to_datetime(self.stringsD, format='%Y%m%d') + def time_iso8601(self): + to_datetime(self.strings) + + def time_iso8601_nosep(self): + to_datetime(self.strings_nosep) + + def time_iso8601_format(self): + to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') + + def time_iso8601_format_no_sep(self): + to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') + + def time_iso8601_tz_spaceformat(self): + to_datetime(self.strings_tz_space) + + def time_format_exact(self): + to_datetime(self.s2, format='%d%b%y') + + def time_format_no_exact(self): + to_datetime(self.s, format='%d%b%y', exact=False) -class timeseries_custom_bday_cal_incr_n(object): + +class Offsets(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.date = dt.datetime(2011, 1, 1) self.dt64 = np.datetime64('2011-01-01 09:00Z') self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() @@ -467,718 +381,63 @@ def setup(self): self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - def time_timeseries_custom_bday_cal_incr_n(self): - (self.date + (10 * self.cdayh)) - - -class timeseries_custom_bday_cal_incr_neg_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + def time_timeseries_day_apply(self): + self.day.apply(self.date) - def time_timeseries_custom_bday_cal_incr_neg_n(self): - (self.date - (10 * self.cdayh)) + def time_timeseries_day_incr(self): + (self.date + self.day) + def time_timeseries_year_apply(self): + self.year.apply(self.date) -class timeseries_custom_bday_decr(object): - goal_time = 0.2 + def time_timeseries_year_incr(self): + (self.date + self.year) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + # custom business offsets - def time_timeseries_custom_bday_decr(self): + def time_custom_bday_decr(self): (self.date - self.cday) - -class timeseries_custom_bday_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bday_incr(self): + def time_custom_bday_incr(self): (self.date + self.cday) + def time_custom_bday_apply(self): + self.cday.apply(self.date) -class timeseries_custom_bmonthbegin_decr_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthbegin_decr_n(self): - (self.date - (10 * self.cmb)) - - -class timeseries_custom_bmonthbegin_incr_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthbegin_incr_n(self): - (self.date + (10 * self.cmb)) - - -class timeseries_custom_bmonthend_decr_n(object): - goal_time = 0.2 + def time_custom_bday_apply_dt64(self): + self.cday.apply(self.dt64) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + def time_custom_bday_cal_incr(self): + self.date + 1 * self.cdayh - def time_timeseries_custom_bmonthend_decr_n(self): - (self.date - (10 * self.cme)) + def time_custom_bday_cal_decr(self): + self.date - 1 * self.cdayh + def time_custom_bday_cal_incr_n(self): + self.date + 10 * self.cdayh -class timeseries_custom_bmonthend_incr(object): - goal_time = 0.2 + def time_custom_bday_cal_incr_neg_n(self): + self.date - 10 * self.cdayh - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + # Increment custom business month - def time_timeseries_custom_bmonthend_incr(self): + def time_custom_bmonthend_incr(self): (self.date + self.cme) - -class timeseries_custom_bmonthend_incr_n(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_custom_bmonthend_incr_n(self): + def time_custom_bmonthend_incr_n(self): (self.date + (10 * self.cme)) + def time_custom_bmonthend_decr_n(self): + (self.date - (10 * self.cme)) -class timeseries_datetimeindex_offset_delta(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_delta(self): - (self.idx1 + self.delta_offset) - - -class timeseries_datetimeindex_offset_fast(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_fast(self): - (self.idx1 + self.fast_offset) - - -class timeseries_datetimeindex_offset_slow(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_datetimeindex_offset_slow(self): - (self.idx1 + self.slow_offset) - - -class timeseries_day_apply(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_apply(self): - self.day.apply(self.date) - - -class timeseries_day_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_day_incr(self): - (self.date + self.day) - - -class timeseries_infer_freq(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/1700', freq='D', periods=100000) - self.a = self.rng[:50000].append(self.rng[50002:]) - - def time_timeseries_infer_freq(self): - infer_freq(self.a) - - -class timeseries_is_month_start(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 10000 - self.rng = date_range(start='1/1/1', periods=self.N, freq='B') - - def time_timeseries_is_month_start(self): - self.rng.is_month_start - - -class timeseries_iter_datetimeindex(object): - goal_time = 0.2 + def time_custom_bmonthbegin_decr_n(self): + (self.date - (10 * self.cmb)) - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_datetimeindex(self): - self.iter_n(self.idx1) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_iter_datetimeindex_preexit(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_datetimeindex_preexit(self): - self.iter_n(self.idx1, self.M) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_iter_periodindex(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_periodindex(self): - self.iter_n(self.idx2) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_iter_periodindex_preexit(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def time_timeseries_iter_periodindex_preexit(self): - self.iter_n(self.idx2, self.M) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - -class timeseries_large_lookup_value(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=1500000, freq='S') - self.ts = Series(1, index=self.rng) - - def time_timeseries_large_lookup_value(self): - self.ts[self.ts.index[(len(self.ts) // 2)]] - self.ts.index._cleanup() - - -class timeseries_period_downsample_mean(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_timeseries_period_downsample_mean(self): - self.ts.resample('D', how='mean') - - -class timeseries_resample_datetime64(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') - self.int_ts = Series(5, self.rng, dtype='int64') - self.ts = self.int_ts.astype('datetime64[ns]') - - def time_timeseries_resample_datetime64(self): - self.ts.resample('1S', how='last') - - -class timeseries_series_offset_delta(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_delta(self): - (self.s + self.delta_offset) - - -class timeseries_series_offset_fast(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_fast(self): - (self.s + self.fast_offset) - - -class timeseries_series_offset_slow(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_timeseries_series_offset_slow(self): - (self.s + self.slow_offset) - - -class timeseries_slice_minutely(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_slice_minutely(self): - self.ts[:10000] - - -class timeseries_sort_index(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='s') - self.rng = self.rng.take(np.random.permutation(self.N)) - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_timeseries_sort_index(self): - self.ts.sort_index() - - -class timeseries_timestamp_downsample_mean(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_timeseries_timestamp_downsample_mean(self): - self.ts.resample('D', how='mean') - - -class timeseries_timestamp_tzinfo_cons(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') - - def time_timeseries_timestamp_tzinfo_cons(self): - self.rng[0] - - -class timeseries_to_datetime_YYYYMMDD(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.strings = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) - - def time_timeseries_to_datetime_YYYYMMDD(self): - to_datetime(self.strings, format='%Y%m%d') - - -class timeseries_to_datetime_iso8601(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] - self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' - for x in self.rng] - - def time_timeseries_to_datetime_iso8601(self): - to_datetime(self.strings) - - def time_timeseries_to_datetime_iso8601_nosep(self): - to_datetime(self.strings_nosep) - - def time_timeseries_to_datetime_iso8601_format(self): - to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') - - def time_timeseries_to_datetime_iso8601_format_no_sep(self): - to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') - - def time_timeseries_to_datetime_iso8601_tz_spaceformat(self): - to_datetime(self.strings_tz_space) - - -class timeseries_with_format_no_exact(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - - def time_timeseries_with_format_no_exact(self): - to_datetime(self.s, format='%d%b%y', exact=False) - - -class timeseries_with_format_replace(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - - def time_timeseries_with_format_replace(self): - to_datetime(self.s.str.replace(':\\S+$', ''), format='%d%b%y') - - -class timeseries_year_apply(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_year_apply(self): - self.year.apply(self.date) - - -class timeseries_year_incr(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) - - def time_timeseries_year_incr(self): - (self.date + self.year) + def time_custom_bmonthbegin_incr_n(self): + (self.date + (10 * self.cmb)) -class timeseries_semi_month_offset(object): +class SemiMonthOffset(object): goal_time = 0.2 def setup(self): @@ -1189,50 +448,50 @@ def setup(self): self.semi_month_end = pd.offsets.SemiMonthEnd() self.semi_month_begin = pd.offsets.SemiMonthBegin() - def time_semi_month_end_apply(self): + def time_end_apply(self): self.semi_month_end.apply(self.date) - def time_semi_month_end_incr(self): + def time_end_incr(self): self.date + self.semi_month_end - def time_semi_month_end_incr_n(self): + def time_end_incr_n(self): self.date + 10 * self.semi_month_end - def time_semi_month_end_decr(self): + def time_end_decr(self): self.date - self.semi_month_end - def time_semi_month_end_decr_n(self): + def time_end_decr_n(self): self.date - 10 * self.semi_month_end - def time_semi_month_end_apply_index(self): + def time_end_apply_index(self): self.semi_month_end.apply_index(self.rng) - def time_semi_month_end_incr_rng(self): + def time_end_incr_rng(self): self.rng + self.semi_month_end - def time_semi_month_end_decr_rng(self): + def time_end_decr_rng(self): self.rng - self.semi_month_end - def time_semi_month_begin_apply(self): + def time_begin_apply(self): self.semi_month_begin.apply(self.date) - def time_semi_month_begin_incr(self): + def time_begin_incr(self): self.date + self.semi_month_begin - def time_semi_month_begin_incr_n(self): + def time_begin_incr_n(self): self.date + 10 * self.semi_month_begin - def time_semi_month_begin_decr(self): + def time_begin_decr(self): self.date - self.semi_month_begin - def time_semi_month_begin_decr_n(self): + def time_begin_decr_n(self): self.date - 10 * self.semi_month_begin - def time_semi_month_begin_apply_index(self): + def time_begin_apply_index(self): self.semi_month_begin.apply_index(self.rng) - def time_semi_month_begin_incr_rng(self): + def time_begin_incr_rng(self): self.rng + self.semi_month_begin - def time_semi_month_begin_decr_rng(self): + def time_begin_decr_rng(self): self.rng - self.semi_month_begin From d1ecbe8f1885311d7509348a8670621f04cb124b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Jul 2016 22:40:58 +0200 Subject: [PATCH 07/20] Clean-up string benchmarks --- asv_bench/benchmarks/strings.py | 368 ++++---------------------------- 1 file changed, 41 insertions(+), 327 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d64606214ca6a..c1600d4e07f58 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -4,390 +4,104 @@ import pandas.util.testing as testing -class strings_cat(object): +class StringMethods(object): goal_time = 0.2 - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_cat(self): - self.many.str.cat(sep=',') - def make_series(self, letters, strlen, size): return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - -class strings_center(object): - goal_time = 0.2 - def setup(self): self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + self.s = self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') - def time_strings_center(self): - self.many.str.center(100) - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_few(object): - goal_time = 0.2 + def time_cat(self): + self.many.str.cat(sep=',') - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_center(self): + self.many.str.center(100) - def time_strings_contains_few(self): + def time_contains_few(self): self.few.str.contains('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_few_noregex(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_few_noregex(self): + def time_contains_few_noregex(self): self.few.str.contains('matchthis', regex=False) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_many(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_many(self): + def time_contains_many(self): self.many.str.contains('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_contains_many_noregex(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_contains_many_noregex(self): + def time_contains_many_noregex(self): self.many.str.contains('matchthis', regex=False) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_count(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_count(self): + def time_count(self): self.many.str.count('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_encode_decode(object): - goal_time = 0.2 - - def setup(self): - self.ser = Series(testing.makeUnicodeIndex()) - - def time_strings_encode_decode(self): - self.ser.str.encode('utf-8').str.decode('utf-8') - - -class strings_endswith(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_endswith(self): + def time_endswith(self): self.many.str.endswith('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_extract(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_extract(self): + def time_extract(self): self.many.str.extract('(\\w*)matchthis(\\w*)') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_findall(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_findall(self): + def time_findall(self): self.many.str.findall('[A-Z]+') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_get(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_get(self): + def time_get(self): self.many.str.get(0) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_get_dummies(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - self.s = self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') - - def time_strings_get_dummies(self): - self.s.str.get_dummies('|') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_join_split(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_join_split(self): + def time_join_split(self): self.many.str.join('--').str.split('--') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_join_split_expand(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_join_split_expand(self): + def time_join_split_expand(self): self.many.str.join('--').str.split('--', expand=True) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_len(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_len(self): + def time_len(self): self.many.str.len() - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_lower(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_lower(self): - self.many.str.lower() - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_lstrip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_lstrip(self): - self.many.str.lstrip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_match(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_match(self): + def time_match(self): self.many.str.match('mat..this') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_pad(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_pad(self): + def time_pad(self): self.many.str.pad(100, side='both') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_repeat(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_repeat(self): + def time_repeat(self): self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many)))) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_replace(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_replace(self): + def time_replace(self): self.many.str.replace('(matchthis)', '\x01\x01') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_rstrip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_rstrip(self): - self.many.str.rstrip('matchthis') - - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_slice(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_slice(self): + def time_slice(self): self.many.str.slice(5, 15, 2) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_startswith(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_startswith(self): + def time_startswith(self): self.many.str.startswith('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) - - -class strings_strip(object): - goal_time = 0.2 - - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_strip(self): + def time_strip(self): self.many.str.strip('matchthis') - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_rstrip(self): + self.many.str.rstrip('matchthis') + def time_lstrip(self): + self.many.str.lstrip('matchthis') -class strings_title(object): - goal_time = 0.2 + def time_title(self): + self.many.str.title() - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) + def time_upper(self): + self.many.str.upper() - def time_strings_title(self): - self.many.str.title() + def time_lower(self): + self.many.str.lower() - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_get_dummies(self): + self.s.str.get_dummies('|') -class strings_upper(object): +class StringEncode(object): goal_time = 0.2 def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - - def time_strings_upper(self): - self.many.str.upper() + self.ser = Series(testing.makeUnicodeIndex()) - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) + def time_encode_decode(self): + self.ser.str.encode('utf-8').str.decode('utf-8') From 7aad492bf19a8598a89e5ded2c0accd810fa5494 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 28 Aug 2016 12:29:13 +0200 Subject: [PATCH 08/20] Clean-up merge asof benchmarks --- asv_bench/benchmarks/join_merge.py | 98 +++++++++++++----------------- 1 file changed, 42 insertions(+), 56 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index f146ebdc41558..9eefe80c8e5e4 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -183,62 +183,6 @@ def time_left_outer_join_index(self): self.left.join(self.right, on='jim') -class merge_asof_noby(object): - - def setup(self): - np.random.seed(0) - one_count = 200000 - two_count = 1000000 - self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), - 'value1': np.random.randn(one_count)}) - self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), - 'value2': np.random.randn(two_count)}) - self.df1 = self.df1.sort_values('time') - self.df2 = self.df2.sort_values('time') - - def time_merge_asof_noby(self): - merge_asof(self.df1, self.df2, on='time') - - -class merge_asof_by_object(object): - - def setup(self): - import string - np.random.seed(0) - one_count = 200000 - two_count = 1000000 - self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), - 'key': np.random.choice(list(string.uppercase), one_count), - 'value1': np.random.randn(one_count)}) - self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), - 'key': np.random.choice(list(string.uppercase), two_count), - 'value2': np.random.randn(two_count)}) - self.df1 = self.df1.sort_values('time') - self.df2 = self.df2.sort_values('time') - - def time_merge_asof_by_object(self): - merge_asof(self.df1, self.df2, on='time', by='key') - - -class merge_asof_by_int(object): - - def setup(self): - np.random.seed(0) - one_count = 200000 - two_count = 1000000 - self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), - 'key': np.random.randint(0, 25, one_count), - 'value1': np.random.randn(one_count)}) - self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), - 'key': np.random.randint(0, 25, two_count), - 'value2': np.random.randn(two_count)}) - self.df1 = self.df1.sort_values('time') - self.df2 = self.df2.sort_values('time') - - def time_merge_asof_by_int(self): - merge_asof(self.df1, self.df2, on='time', by='key') - - class join_non_unique_equal(object): # outer join of non-unique # GH 6329 @@ -333,6 +277,48 @@ def time_merge_ordered(self): merge_ordered(self.left, self.right, on='key', left_by='group') +# ---------------------------------------------------------------------- +# asof merge + +class MergeAsof(object): + + def setup(self): + import string + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + + self.df1 = pd.DataFrame( + {'time': np.random.randint(0, one_count / 20, one_count), + 'key': np.random.choice(list(string.uppercase), one_count), + 'key2': np.random.randint(0, 25, one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame( + {'time': np.random.randint(0, two_count / 20, two_count), + 'key': np.random.choice(list(string.uppercase), two_count), + 'key2': np.random.randint(0, 25, two_count), + 'value2': np.random.randn(two_count)}) + + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + self.df1a = self.df1[['time', 'value1']] + self.df2a = self.df2[['time', 'value2']] + self.df1b = self.df1[['time', 'key', 'value1']] + self.df2b = self.df2[['time', 'key', 'value2']] + self.df1c = self.df1[['time', 'key2', 'value1']] + self.df2c = self.df2[['time', 'key2', 'value2']] + + def time_noby(self): + merge_asof(self.df1a, self.df2a, on='time') + + def time_by_object(self): + merge_asof(self.df1b, self.df2b, on='time', by='key') + + def time_by_int(self): + merge_asof(self.df1c, self.df2c, on='time', by='key2') + + #---------------------------------------------------------------------- # data alignment From 731f50c5007e5b2cb52b57bc7de0610650ed70be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 28 Aug 2016 13:31:30 +0200 Subject: [PATCH 09/20] Clean-up algos/attrs/binary_ops benchmarks --- asv_bench/benchmarks/algorithms.py | 12 +- asv_bench/benchmarks/attrs_caching.py | 14 +- asv_bench/benchmarks/binary_ops.py | 238 +++++--------------------- asv_bench/benchmarks/inference.py | 9 +- 4 files changed, 56 insertions(+), 217 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 53b7d55368f6a..7f427b7a2e4fc 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -3,7 +3,7 @@ from pandas.util import testing as tm -class algorithm(object): +class Algorithms(object): goal_time = 0.2 def setup(self): @@ -24,19 +24,19 @@ def setup(self): self.arrneg = np.arange(-1000000, 0) self.arrmixed = np.array([1, -1]).repeat(500000) - def time_int_factorize(self): + def time_factorize_int(self): self.int.factorize() - def time_float_factorize(self): + def time_factorize_float(self): self.int.factorize() - def time_int_unique_duplicated(self): + def time_duplicated_int_unique(self): self.int_unique.duplicated() - def time_int_duplicated(self): + def time_duplicated_int(self): self.int.duplicated() - def time_float_duplicated(self): + def time_duplicated_float(self): self.float.duplicated() def time_add_overflow_pos_scalar(self): diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index de9aa18937985..1f41d24fd4cbc 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,23 +1,15 @@ from .pandas_vb_common import * -class getattr_dataframe_index(object): +class DataFrameAttributes(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index - def time_getattr_dataframe_index(self): + def time_get_index(self): self.foo = self.df.index - -class setattr_dataframe_index(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10, 6)) - self.cur_index = self.df.index - - def time_setattr_dataframe_index(self): + def time_set_index(self): self.df.index = self.cur_index diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index d22d01f261b27..938f59db37df6 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -2,193 +2,76 @@ import pandas.computation.expressions as expr -class frame_add(object): +class Ops(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_add(self): - (self.df + self.df2) + params = [[True, False], ['default', 1]] + param_names = ['use_numexpr', 'threads'] - -class frame_add_no_ne(object): - goal_time = 0.2 - - def setup(self): + def setup(self, use_numexpr, threads): self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - def time_frame_add_no_ne(self): - (self.df + self.df2) - - def teardown(self): - expr.set_use_numexpr(True) + if threads != 'default': + expr.set_numexpr_threads(threads) + if not use_numexpr: + expr.set_use_numexpr(False) -class frame_add_st(object): - goal_time = 0.2 + def time_frame_add(self, use_numexpr, threads): + (self.df + self.df2) - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) + def time_frame_mult(self, use_numexpr, threads): + (self.df * self.df2) - def time_frame_add_st(self): - (self.df + self.df2) + def time_frame_multi_and(self, use_numexpr, threads): + self.df[((self.df > 0) & (self.df2 > 0))] - def teardown(self): + def teardown(self, use_numexpr, threads): + expr.set_use_numexpr(True) expr.set_numexpr_threads() -class frame_float_div(object): +class Ops2(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randn(1000, 1000)) self.df2 = DataFrame(np.random.randn(1000, 1000)) - def time_frame_float_div(self): - (self.df // self.df2) + self.df_int = DataFrame( + np.random.random_integers(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(1000, 1000))) + self.df2_int = DataFrame( + np.random.random_integers(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(1000, 1000))) + ## Division -class frame_float_div_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) + def time_frame_float_div(self): + (self.df // self.df2) def time_frame_float_div_by_zero(self): (self.df / 0) - -class frame_float_floor_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - def time_frame_float_floor_by_zero(self): (self.df // 0) - -class frame_float_mod(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_float_mod(self): - (self.df / self.df2) - - -class frame_int_div_by_zero(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) - def time_frame_int_div_by_zero(self): - (self.df / 0) + (self.df_int / 0) - -class frame_int_mod(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) - self.df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) + ## Modulo def time_frame_int_mod(self): (self.df / self.df2) - -class frame_mult(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_mult(self): - (self.df * self.df2) - - -class frame_mult_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_mult_no_ne(self): - (self.df * self.df2) - - def teardown(self): - expr.set_use_numexpr(True) - - -class frame_mult_st(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_mult_st(self): - (self.df * self.df2) - - def teardown(self): - expr.set_numexpr_threads() - - -class frame_multi_and(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - - def time_frame_multi_and(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - -class frame_multi_and_no_ne(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_use_numexpr(False) - - def time_frame_multi_and_no_ne(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - def teardown(self): - expr.set_use_numexpr(True) - - -class frame_multi_and_st(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_frame_multi_and_st(self): - self.df[((self.df > 0) & (self.df2 > 0))] - - def teardown(self): - expr.set_numexpr_threads() + def time_frame_float_mod(self): + (self.df / self.df2) -class series_timestamp_compare(object): +class Timeseries(object): goal_time = 0.2 def setup(self): @@ -197,65 +80,28 @@ def setup(self): self.s = Series(date_range('20010101', periods=self.N, freq='T')) self.ts = self.s[self.halfway] + self.s2 = Series(date_range('20010101', periods=self.N, freq='s')) + def time_series_timestamp_compare(self): (self.s <= self.ts) - -class timestamp_ops_diff1(object): - goal_time = 0.2 - N = 1000000 - - def setup(self): - self.s = self.create() - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s')) + def time_timestamp_series_compare(self): + (self.ts >= self.s) def time_timestamp_ops_diff1(self): - self.s.diff() - -class timestamp_tz_ops_diff1(timestamp_ops_diff1): - N = 10000 - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) - -class timestamp_ops_diff2(object): - goal_time = 0.2 - N = 1000000 - - def setup(self): - self.s = self.create() - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s')) + self.s2.diff() def time_timestamp_ops_diff2(self): (self.s - self.s.shift()) -class timestamp_tz_ops_diff2(timestamp_ops_diff2): - N = 10000 - def create(self): - return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) -class timestamp_series_compare(object): - goal_time = 0.2 - N = 1000000 +class TimeseriesTZ(Timeseries): def setup(self): + self.N = 1000000 self.halfway = ((self.N // 2) - 1) - self.s = self.create() + self.s = Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) self.ts = self.s[self.halfway] - def create(self): - return Series(date_range('20010101', periods=self.N, freq='T')) - - def time_timestamp_series_compare(self): - (self.ts >= self.s) - -class timestamp_tz_series_compare(timestamp_series_compare): - N = 10000 - - def create(self): - return Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) + self.s2 = Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) \ No newline at end of file diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 2e394ed4268f3..136d5fc12d3e8 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -146,14 +146,15 @@ class to_numeric(object): [None, 'integer', 'signed', 'unsigned', 'float']] N = 500000 + N2 = int(N / 2) data_dict = { - 'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)), - 'string-nint': (['-1'] * (N // 2)) + ([2] * (N // 2)), + 'string-int': (['1'] * N2) + ([2] * N2), + 'string-nint': (['-1'] * N2) + ([2] * N2), 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)), - 'int-list': ([1] * (N // 2)) + ([2] * (N // 2)), + 'string-float': (['1.1'] * N2) + ([2] * N2), + 'int-list': ([1] * N2) + ([2] * N2), 'int32': np.repeat(np.int32(1), N) } From f174d8df130033da96ddc58a5834fa8936da536b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 28 Aug 2016 15:15:13 +0200 Subject: [PATCH 10/20] Clean-up categorical benchmarks --- asv_bench/benchmarks/categoricals.py | 92 ++++++++++------------------ 1 file changed, 33 insertions(+), 59 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index bf1e1b3f40ab0..cca652c68cf15 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -3,89 +3,63 @@ from pandas.types.concat import union_categoricals except ImportError: pass -import string -class concat_categorical(object): +class Categoricals(object): goal_time = 0.2 def setup(self): - self.s = pd.Series((list('aabbcd') * 1000000)).astype('category') + N = 100000 + self.s = pd.Series((list('aabbcd') * N)).astype('category') - def time_concat_categorical(self): - concat([self.s, self.s]) + self.a = pd.Categorical((list('aabbcd') * N)) + self.b = pd.Categorical((list('bbcdjk') * N)) + self.categories = list('abcde') + self.cat_idx = Index(self.categories) + self.values = np.tile(self.categories, N) + self.codes = np.tile(range(len(self.categories)), N) -class union_categorical(object): - goal_time = 0.2 + self.datetimes = pd.Series(pd.date_range( + '1995-01-01 00:00:00', periods=10000, freq='s')) - def setup(self): - self.a = pd.Categorical((list('aabbcd') * 1000000)) - self.b = pd.Categorical((list('bbcdjk') * 1000000)) + def time_concat(self): + concat([self.s, self.s]) - def time_union_categorical(self): + def time_union(self): union_categoricals([self.a, self.b]) - -class categorical_value_counts(object): - goal_time = 1 - - def setup(self): - n = 500000 - np.random.seed(2718281) - arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] - self.ts = Series(arr).astype('category') - - def time_value_counts(self): - self.ts.value_counts(dropna=False) - - def time_value_counts_dropna(self): - self.ts.value_counts(dropna=True) - - -class categorical_constructor(object): - goal_time = 0.2 - - def setup(self): - n = 5 - N = 1e6 - self.categories = list(string.ascii_letters[:n]) - self.cat_idx = Index(self.categories) - self.values = np.tile(self.categories, N) - self.codes = np.tile(range(n), N) - - def time_regular_constructor(self): + def time_constructor_regular(self): Categorical(self.values, self.categories) - def time_fastpath(self): + def time_constructor_fastpath(self): Categorical(self.codes, self.cat_idx, fastpath=True) - -class categorical_constructor_with_datetimes(object): - goal_time = 0.2 - - def setup(self): - self.datetimes = pd.Series(pd.date_range( - '1995-01-01 00:00:00', periods=10000, freq='s')) - - def time_datetimes(self): + def time_constructor_datetimes(self): Categorical(self.datetimes) - def time_datetimes_with_nat(self): + def time_constructor_datetimes_with_nat(self): t = self.datetimes t.iloc[-1] = pd.NaT Categorical(t) -class categorical_rendering(object): - goal_time = 3e-3 +class Categoricals2(object): + goal_time = 0.2 def setup(self): - n = 1000 - items = [str(i) for i in range(n)] - s = pd.Series(items, dtype='category') - df = pd.DataFrame({'C': s, 'data': np.random.randn(n)}) - self.data = df[df.C == '20'] + n = 500000 + np.random.seed(2718281) + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.ts = Series(arr).astype('category') + + self.sel = self.ts.loc[[0]] + + def time_value_counts(self): + self.ts.value_counts(dropna=False) + + def time_value_counts_dropna(self): + self.ts.value_counts(dropna=True) def time_rendering(self): - str(self.data.C) + str(self.sel) From cce95ee0f667a3de60923b0636df487f20571872 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 28 Aug 2016 16:23:03 +0200 Subject: [PATCH 11/20] Clean-up constructor benchmarks --- asv_bench/benchmarks/ctors.py | 46 +- asv_bench/benchmarks/frame_ctor.py | 1728 ++-------------------------- 2 files changed, 92 insertions(+), 1682 deletions(-) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index f68cf9399c546..b5694a3a21502 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,52 +1,30 @@ from .pandas_vb_common import * -class frame_constructor_ndarray(object): +class Constructors(object): goal_time = 0.2 def setup(self): self.arr = np.random.randn(100, 100) + self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - def time_frame_constructor_ndarray(self): - DataFrame(self.arr) - - -class ctor_index_array_string(object): - goal_time = 0.2 - - def setup(self): - self.data = np.array(['foo', 'bar', 'baz'], dtype=object) - - def time_ctor_index_array_string(self): - Index(self.data) - - -class series_constructor_ndarray(object): - goal_time = 0.2 - - def setup(self): self.data = np.random.randn(100) self.index = Index(np.arange(100)) - def time_series_constructor_ndarray(self): - Series(self.data, index=self.index) + self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')] * 1000)) + def time_frame_from_ndarray(self): + DataFrame(self.arr) -class dtindex_from_series_ctor(object): - goal_time = 0.2 + def time_series_from_ndarray(self): + pd.Series(self.data, index=self.index) - def setup(self): - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) + def time_index_from_array_string(self): + Index(self.arr_str) - def time_dtindex_from_series_ctor(self): + def time_dtindex_from_series(self): DatetimeIndex(self.s) - -class index_from_series_ctor(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * 1000)) - - def time_index_from_series_ctor(self): + def time_dtindex_from_series2(self): Index(self.s) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 6f40611e68531..05c1a27fdf8ca 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -5,1617 +5,10 @@ from pandas.core.datetools import * -class frame_ctor_dtindex_BDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BMonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BMonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BMonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BQuarterEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BQuarterEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BQuarterEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BYearEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BYearEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BYearEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessHourx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessHour(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessHourx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_BusinessHourx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(BusinessHour(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_BusinessHourx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CBMonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CBMonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CBMonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CustomBusinessDayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CustomBusinessDay(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CustomBusinessDayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_CustomBusinessDayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(CustomBusinessDay(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_CustomBusinessDayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_DateOffsetx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(DateOffset(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_DateOffsetx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_DateOffsetx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(DateOffset(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_DateOffsetx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Dayx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Day(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Dayx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Dayx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Day(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Dayx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Easterx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Easter(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Easterx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Easterx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Easter(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Easterx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx1__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx1__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(1, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx1__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx2__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx2__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253Quarter(2, **{'startingMonth': 1, 'qtr_with_extra_week': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253Quarterx2__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x1__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x1__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x1__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(1, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x1__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x2__variation_last(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'last', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x2__variation_last(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_FY5253x2__variation_nearest(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(FY5253(2, **{'startingMonth': 1, 'weekday': 1, 'variation': 'nearest', })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_FY5253x2__variation_nearest(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Hourx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Hour(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Hourx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Hourx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Hour(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Hourx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_LastWeekOfMonthx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(LastWeekOfMonth(1, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_LastWeekOfMonthx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_LastWeekOfMonthx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(LastWeekOfMonth(2, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_LastWeekOfMonthx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Microx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Micro(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Microx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Microx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Micro(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Microx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Millix1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Milli(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Millix1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Millix2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Milli(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Millix2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Minutex1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Minute(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Minutex1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Minutex2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Minute(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Minutex2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) +#---------------------------------------------------------------------- +# Creation from nested dict - -class frame_ctor_dtindex_MonthEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_MonthEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(MonthEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_MonthEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Nanox1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Nano(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Nanox1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Nanox2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Nano(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Nanox2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_QuarterEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(QuarterEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_QuarterEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Secondx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Second(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Secondx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Secondx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Second(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Secondx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_WeekOfMonthx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(WeekOfMonth(1, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_WeekOfMonthx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_WeekOfMonthx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(WeekOfMonth(2, **{'week': 1, 'weekday': 1, })) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_WeekOfMonthx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Weekx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Week(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Weekx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_Weekx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(Week(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_Weekx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearBeginx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearBegin(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearBeginx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearBeginx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearBegin(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearBeginx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearEndx1(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearEnd(1, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearEndx1(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_dtindex_YearEndx2(object): - goal_time = 0.2 - - def setup(self): - self.idx = self.get_index_for_offset(YearEnd(2, **{})) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) - - def time_frame_ctor_dtindex_YearEndx2(self): - DataFrame(self.d) - - def get_period_count(self, start_date, off): - self.ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (self.ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // self.ten_offsets_in_days)), 1000) - - def get_index_for_offset(self, off): - self.start_date = Timestamp('1/1/1900') - return date_range(self.start_date, periods=min(1000, self.get_period_count(self.start_date, off)), freq=off) - - -class frame_ctor_list_of_dict(object): +class FromDicts(object): goal_time = 0.2 def setup(self): @@ -1630,39 +23,26 @@ def setup(self): self.some_dict = self.data.values()[0] self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] + self.data2 = dict( + ((i, dict(((j, float(j)) for j in range(100)))) for i in + xrange(2000))) + def time_frame_ctor_list_of_dict(self): DataFrame(self.dict_list) - -class frame_ctor_nested_dict(object): - goal_time = 0.2 - - def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] - def time_frame_ctor_nested_dict(self): DataFrame(self.data) - -class frame_ctor_nested_dict_int64(object): - goal_time = 0.2 - - def setup(self): - self.data = dict(((i, dict(((j, float(j)) for j in range(100)))) for i in xrange(2000))) + def time_series_ctor_from_dict(self): + Series(self.some_dict) def time_frame_ctor_nested_dict_int64(self): + # nested dict, integer indexes, regression described in #621 DataFrame(self.data) +# from a mi-series + class frame_from_series(object): goal_time = 0.2 @@ -1670,10 +50,13 @@ def setup(self): self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)]) self.s = Series(randn(10000), index=self.mi) - def time_frame_from_series(self): + def time_frame_from_mi_series(self): DataFrame(self.s) +#---------------------------------------------------------------------- +# get_numeric_data + class frame_get_numeric_data(object): goal_time = 0.2 @@ -1687,20 +70,69 @@ def time_frame_get_numeric_data(self): self.df._get_numeric_data() -class series_ctor_from_dict(object): - goal_time = 0.2 +# ---------------------------------------------------------------------- +# From dict with DatetimeIndex with all offsets - def setup(self): - (N, K) = (5000, 50) - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = self.data.values()[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] +# dynamically generate benchmarks for every offset +# +# get_period_count & get_index_for_offset are there because blindly taking each +# offset times 1000 can easily go out of Timestamp bounds and raise errors. - def time_series_ctor_from_dict(self): - Series(self.some_dict) + +def get_period_count(start_date, off): + ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days + if (ten_offsets_in_days == 0): + return 1000 + else: + return min((9 * ((Timestamp.max - start_date).days // ten_offsets_in_days)), 1000) + + +def get_index_for_offset(off): + start_date = Timestamp('1/1/1900') + return date_range(start_date, periods=min(1000, get_period_count( + start_date, off)), freq=off) + + +all_offsets = offsets.__all__ +# extra cases +for off in ['FY5253', 'FY5253Quarter']: + all_offsets.pop(all_offsets.index(off)) + all_offsets.extend([off + '_1', off + '_2']) + + +class FrameConstructorDTIndexFromOffsets(object): + + params = [all_offsets, [1, 2]] + param_names = ['offset', 'n_steps'] + + offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, + 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, + 'FY5253': {'startingMonth': 1, 'weekday': 1}, + 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} + + offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, + 'FY5253Quarter': {'variation': ['nearest', 'last']}} + + def setup(self, offset, n_steps): + + extra = False + if offset.endswith("_", None, -1): + extra = int(offset[-1]) + offset = offset[:-2] + + kwargs = {} + if offset in self.offset_kwargs: + kwargs = self.offset_kwargs[offset] + + if extra: + extras = self.offset_extra_cases[offset] + for extra_arg in extras: + kwargs[extra_arg] = extras[extra_arg][extra -1] + + offset = getattr(offsets, offset) + self.idx = get_index_for_offset(offset(n_steps, **kwargs)) + self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) + self.d = dict([(col, self.df[col]) for col in self.df.columns]) + + def time_frame_ctor(self, offset, n_steps): + DataFrame(self.d) From f0fbaed516efbc5ed19836fd89b8ac3c90a543a6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 28 Aug 2016 17:07:53 +0200 Subject: [PATCH 12/20] Clean-up eval + panel benchmarks --- asv_bench/benchmarks/eval.py | 41 +++++++----------------- asv_bench/benchmarks/panel_ctor.py | 8 ++--- asv_bench/benchmarks/panel_methods.py | 46 ++++----------------------- 3 files changed, 22 insertions(+), 73 deletions(-) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index d9978e0cc4595..a0819e33dc254 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -3,7 +3,7 @@ import pandas.computation.expressions as expr -class eval_frame(object): +class Eval(object): goal_time = 0.2 params = [['numexpr', 'python'], [1, 'all']] @@ -34,8 +34,11 @@ def time_mult(self, engine, threads): df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 pd.eval('df * df2 * df3 * df4', engine=engine) + def teardown(self, engine, threads): + expr.set_numexpr_threads() -class query_datetime_index(object): + +class Query(object): goal_time = 0.2 def setup(self): @@ -45,41 +48,19 @@ def setup(self): self.s = Series(self.index) self.ts = self.s.iloc[self.halfway] self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) + self.df2 = DataFrame({'dates': self.s.values,}) + + self.df3 = DataFrame({'a': np.random.randn(self.N),}) + self.min_val = self.df3['a'].min() + self.max_val = self.df3['a'].max() def time_query_datetime_index(self): ts = self.ts self.df.query('index < @ts') - -class query_datetime_series(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'dates': self.s.values, }) - def time_query_datetime_series(self): ts = self.ts - self.df.query('dates < @ts') - - -class query_with_boolean_selection(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.N = 1000000 - self.df = DataFrame({'a': np.random.randn(self.N), }) - self.min_val = self.df['a'].min() - self.max_val = self.df['a'].max() + self.df2.query('dates < @ts') def time_query_with_boolean_selection(self): min_val, max_val = self.min_val, self.max_val diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 4f6fd4a5a2df8..faedce6c574ec 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,7 +1,7 @@ from .pandas_vb_common import * -class panel_from_dict_all_different_indexes(object): +class Constructors1(object): goal_time = 0.2 def setup(self): @@ -18,7 +18,7 @@ def time_panel_from_dict_all_different_indexes(self): Panel.from_dict(self.data_frames) -class panel_from_dict_equiv_indexes(object): +class Constructors2(object): goal_time = 0.2 def setup(self): @@ -32,7 +32,7 @@ def time_panel_from_dict_equiv_indexes(self): Panel.from_dict(self.data_frames) -class panel_from_dict_same_index(object): +class Constructors3(object): goal_time = 0.2 def setup(self): @@ -46,7 +46,7 @@ def time_panel_from_dict_same_index(self): Panel.from_dict(self.data_frames) -class panel_from_dict_two_different_indexes(object): +class Constructors4(object): goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 0bd572db2211a..ebe278f6e68b5 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,56 +1,24 @@ from .pandas_vb_common import * -class panel_pct_change_items(object): +class PanelMethods(object): goal_time = 0.2 def setup(self): self.index = date_range(start='2000', freq='D', periods=1000) self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - def time_panel_pct_change_items(self): + def time_pct_change_items(self): self.panel.pct_change(1, axis='items') - -class panel_pct_change_major(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_major(self): + def time_pct_change_major(self): self.panel.pct_change(1, axis='major') - -class panel_pct_change_minor(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_pct_change_minor(self): + def time_pct_change_minor(self): self.panel.pct_change(1, axis='minor') - -class panel_shift(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_shift(self): + def time_shift(self): self.panel.shift(1) - -class panel_shift_minor(object): - goal_time = 0.2 - - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - - def time_panel_shift_minor(self): - self.panel.shift(1, axis='minor') + def time_shift_minor(self): + self.panel.shift(1, axis='minor') \ No newline at end of file From 65d720a8d200a17f00da9f41034d47f36f71e2f6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Sep 2016 09:28:18 +0200 Subject: [PATCH 13/20] Clean-up frame_methods benchmarks --- asv_bench/benchmarks/frame_methods.py | 1081 ++++++++----------------- 1 file changed, 334 insertions(+), 747 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index df73a474b2683..b0788891a3418 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -2,444 +2,72 @@ import string -class frame_apply_axis_1(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_axis_1(self): - self.df.apply((lambda x: (x + 1)), axis=1) - - -class frame_apply_lambda_mean(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_lambda_mean(self): - self.df.apply((lambda x: x.sum())) - - -class frame_apply_np_mean(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_np_mean(self): - self.df.apply(np.mean) - - -class frame_apply_pass_thru(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100)) - - def time_frame_apply_pass_thru(self): - self.df.apply((lambda x: x)) - - -class frame_apply_ref_by_name(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - - def time_frame_apply_ref_by_name(self): - self.df.apply((lambda x: (x['A'] + x['B'])), axis=1) - - -class frame_apply_user_func(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.arange(1028.0)) - self.df = DataFrame({i: self.s for i in range(1028)}) - - def time_frame_apply_user_func(self): - self.df.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) - - -class frame_assign_timeseries_index(object): - goal_time = 0.2 - - def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='D') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) - - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index - - -class frame_boolean_row_select(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] - - -class frame_count_level_axis0_mixed_dtypes_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis0_mixed_dtypes_multi(self): - self.df.count(axis=0, level=1) - - -class frame_count_level_axis0_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis0_multi(self): - self.df.count(axis=0, level=1) - - -class frame_count_level_axis1_mixed_dtypes_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis1_mixed_dtypes_multi(self): - self.df.count(axis=1, level=1) - - -class frame_count_level_axis1_multi(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df.index = MultiIndex.from_tuples(self.df.index.map((lambda x: (x, x)))) - self.df.columns = MultiIndex.from_tuples(self.df.columns.map((lambda x: (x, x)))) - - def time_frame_count_level_axis1_multi(self): - self.df.count(axis=1, level=1) - - -class frame_dropna_axis0_all(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) - - -class frame_dropna_axis0_all_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis0_all_mixed_dtypes(self): - self.df.dropna(how='all', axis=0) - - -class frame_dropna_axis0_any(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) - - -class frame_dropna_axis0_any_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis0_any_mixed_dtypes(self): - self.df.dropna(how='any', axis=0) - - -class frame_dropna_axis1_all(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) - - -class frame_dropna_axis1_all_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis1_all_mixed_dtypes(self): - self.df.dropna(how='all', axis=1) - - -class frame_dropna_axis1_any(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - - def time_frame_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) - - -class frame_dropna_axis1_any_mixed_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan - self.df['foo'] = 'bar' - - def time_frame_dropna_axis1_any_mixed_dtypes(self): - self.df.dropna(how='any', axis=1) - - -class frame_dtypes(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - - def time_frame_dtypes(self): - self.df.dtypes - - -class frame_duplicated(object): - goal_time = 0.2 - - def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - def time_frame_duplicated(self): - self.df.duplicated() - -class frame_duplicated_wide(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 100).astype(str)) - - def time_frame_duplicated_wide(self): - self.df.T.duplicated() - -class frame_fancy_lookup(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) - self.df['foo'] = 'bar' - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - - def time_frame_fancy_lookup(self): - self.df.lookup(self.row_labels, self.col_labels) - - -class frame_fancy_lookup_all(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) - self.df['foo'] = 'bar' - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - - def time_frame_fancy_lookup_all(self): - self.df.lookup(self.row_labels_all, self.col_labels_all) - - -class frame_fillna_inplace(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - +#---------------------------------------------------------------------- +# lookup -class frame_float_equal(object): +class frame_fancy_lookup(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_float_equal(self): - self.test_equal('float_df') + self.df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + self.df['foo'] = 'bar' + self.row_labels = list(self.df.index[::10])[:900] + self.col_labels = (list(self.df.columns) * 100) + self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') + self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_frame_fancy_lookup(self): + self.df.lookup(self.row_labels, self.col_labels) - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_frame_fancy_lookup_all(self): + self.df.lookup(self.row_labels_all, self.col_labels_all) - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) +#---------------------------------------------------------------------- +# reindex -class frame_float_unequal(object): +class Reindex(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_float_unequal(self): - self.test_unequal('float_df') - - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) - - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) - - -class frame_from_records_generator(object): - goal_time = 0.2 - - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) - - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) - - -class frame_from_records_generator_nrows(object): - goal_time = 0.2 + self.df = DataFrame(randn(10000, 10000)) + self.idx = np.arange(4000, 7000) - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) + self.df2 = DataFrame( + dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), + 1: randint(0, 1000, 1000).astype( + np.int16), + 2: randint(0, 1000, 1000).astype( + np.int32), + 3: randint(0, 1000, 1000).astype( + np.int64),}[randint(0, 4)]) for c in + range(1000)])) + + def time_reindex_axis0(self): + self.df.reindex(self.idx) - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) + def time_reindex_axis1(self): + self.df.reindex(columns=self.idx) + def time_reindex_both_axes(self): + self.df.reindex(index=self.idx, columns=self.idx) -class frame_get_dtype_counts(object): - goal_time = 0.2 + def time_reindex_both_axes_ix(self): + self.df.ix[(self.idx, self.idx)] - def setup(self): - self.df = DataFrame(np.random.randn(10, 10000)) + def time_reindex_upcast(self): + self.df2.reindex(permutation(range(1200))) - def time_frame_get_dtype_counts(self): - self.df.get_dtype_counts() +#---------------------------------------------------------------------- +# iteritems (monitor no-copying behaviour) -class frame_getitem_single_column(object): +class Iteration(object): goal_time = 0.2 def setup(self): self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_getitem_single_column(self): - self.h() + self.df2 = DataFrame(np.random.randn(50000, 10)) def f(self): if hasattr(self.df, '_item_cache'): @@ -451,290 +79,240 @@ def g(self): for (name, col) in self.df.iteritems(): pass - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - -class frame_getitem_single_column2(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def time_frame_getitem_single_column2(self): - self.j() + def time_iteritems(self): + self.f() - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass + def time_iteritems_cached(self): + self.g() - def g(self): - for (name, col) in self.df.iteritems(): + def time_itertuples(self): + for row in self.df2.itertuples(): pass - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] +#---------------------------------------------------------------------- +# to_string, to_html, repr -class frame_html_repr_trunc_mi(object): +class Formatting(object): goal_time = 0.2 def setup(self): - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, (self.nrows / 100)), 100)) - self.df = DataFrame(self.data, index=self.idx) - - def time_frame_html_repr_trunc_mi(self): - self.df._repr_html_() - + self.df = DataFrame(randn(100, 10)) -class frame_html_repr_trunc_si(object): - goal_time = 0.2 + self.nrows = 500 + self.df2 = DataFrame(randn(self.nrows, 10)) + self.df2[0] = period_range('2000', '2010', self.nrows) + self.df2[1] = range(self.nrows) - def setup(self): self.nrows = 10000 self.data = randn(self.nrows, 10) + self.idx = MultiIndex.from_arrays(np.tile(randn(3, int(self.nrows / 100)), 100)) + self.df3 = DataFrame(self.data, index=self.idx) self.idx = randn(self.nrows) - self.df = DataFrame(self.data, index=self.idx) - - def time_frame_html_repr_trunc_si(self): - self.df._repr_html_() - - -class frame_insert_100_columns_begin(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000 + self.df4 = DataFrame(self.data, index=self.idx) - def time_frame_insert_100_columns_begin(self): - self.f() + self.df_tall = pandas.DataFrame(np.random.randn(10000, 10)) - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) + self.df_wide = pandas.DataFrame(np.random.randn(10, 10000)) + def time_to_string_floats(self): + self.df.to_string() -class frame_insert_500_columns_end(object): - goal_time = 0.2 + def time_to_html_mixed(self): + self.df2.to_html() - def setup(self): - self.N = 1000 + def time_html_repr_trunc_mi(self): + self.df3._repr_html_() - def time_frame_insert_500_columns_end(self): - self.f() + def time_html_repr_trunc_si(self): + self.df4._repr_html_() - def f(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col + def time_repr_tall(self): + repr(self.df_tall) + def time_frame_repr_wide(self): + repr(self.df_wide) -class frame_interpolate(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan +#---------------------------------------------------------------------- +# nulls/masking - def time_frame_interpolate(self): - self.df.interpolate() +## masking -class frame_interpolate_some_good(object): +class frame_mask_bools(object): goal_time = 0.2 def setup(self): - self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), }) - self.df.loc[1::5, 'A'] = np.nan - self.df.loc[1::5, 'C'] = np.nan - - def time_frame_interpolate_some_good(self): - self.df.interpolate() - + self.data = np.random.randn(1000, 500) + self.df = DataFrame(self.data) + self.df = self.df.where((self.df > 0)) + self.bools = (self.df > 0) + self.mask = isnull(self.df) -class frame_interpolate_some_good_infer(object): - goal_time = 0.2 + def time_frame_mask_bools(self): + self.bools.mask(self.mask) - def setup(self): - self.df = DataFrame({'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), 'C': randn(10000), 'D': randn(10000), }) - self.df.loc[1::5, 'A'] = np.nan - self.df.loc[1::5, 'C'] = np.nan + def time_frame_mask_floats(self): + self.bools.astype(float).mask(self.mask) - def time_frame_interpolate_some_good_infer(self): - self.df.interpolate(downcast='infer') +## isnull -class frame_isnull_floats_no_null(object): +class FrameIsnull(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 1000) - self.df = DataFrame(self.data) - - def time_frame_isnull(self): - isnull(self.df) - - -class frame_isnull_floats(object): - goal_time = 0.2 + self.df_no_null = DataFrame(np.random.randn(1000, 1000)) - def setup(self): np.random.seed(1234) self.sample = np.array([np.nan, 1.0]) self.data = np.random.choice(self.sample, (1000, 1000)) self.df = DataFrame(self.data) - def time_frame_isnull(self): - isnull(self.df) - - -class frame_isnull_strings(object): - goal_time = 0.2 - - def setup(self): np.random.seed(1234) self.sample = np.array(list(string.ascii_lowercase) + list(string.ascii_uppercase) + list(string.whitespace)) self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) - - def time_frame_isnull(self): - isnull(self.df) - + self.df_strings= DataFrame(self.data) -class frame_isnull_obj(object): - goal_time = 0.2 - - def setup(self): np.random.seed(1234) self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) + self.df_obj = DataFrame(self.data) - def time_frame_isnull(self): + def time_isnull_floats_no_null(self): + isnull(self.df_no_null) + + def time_isnull(self): isnull(self.df) + def time_isnull_strngs(self): + isnull(self.df_strings) + + def time_isnull_obj(self): + isnull(self.df_obj) + + +# ---------------------------------------------------------------------- +# fillna in place -class frame_iteritems(object): +class frame_fillna_inplace(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) + self.df = DataFrame(randn(10000, 100)) + self.df.values[::2] = np.nan - def time_frame_iteritems(self): - self.f() + def time_frame_fillna_inplace(self): + self.df.fillna(0, inplace=True) - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass - def g(self): - for (name, col) in self.df.iteritems(): - pass +class Dropna(object): + goal_time = 0.2 - def h(self): - for i in range(10000): - self.df2['A'] + def setup(self): + self.data = np.random.randn(10000, 1000) + self.df = DataFrame(self.data) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def j(self): - for i in range(10000): - self.df3[0] + self.df_mi = self.df.copy() + self.df_mi.index = MultiIndex.from_tuples(self.df_mi.index.map((lambda x: (x, x)))) + self.df_mi.columns = MultiIndex.from_tuples(self.df_mi.columns.map((lambda x: (x, x)))) + self.df_mixed_mi = self.df_mixed.copy() + self.df_mixed_mi.index = MultiIndex.from_tuples(self.df_mixed_mi.index.map((lambda x: (x, x)))) + self.df_mixed_mi.columns = MultiIndex.from_tuples(self.df_mixed_mi.columns.map((lambda x: (x, x)))) -class frame_iteritems_cached(object): - goal_time = 0.2 + def time_dropna_axis0_all(self): + self.df.dropna(how='all', axis=0) - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) + def time_dropna_axis0_any(self): + self.df.dropna(how='any', axis=0) - def time_frame_iteritems_cached(self): - self.g() + def time_dropna_axis1_all(self): + self.df.dropna(how='all', axis=1) - def f(self): - if hasattr(self.df, '_item_cache'): - self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass + def time_dropna_axis1_any(self): + self.df.dropna(how='any', axis=1) - def g(self): - for (name, col) in self.df.iteritems(): - pass + def time_dropna_axis0_all_mixed_dtypes(self): + self.df_mixed.dropna(how='all', axis=0) - def h(self): - for i in range(10000): - self.df2['A'] + def time_dropna_axis0_any_mixed_dtypes(self): + self.df_mixed.dropna(how='any', axis=0) + + def time_dropna_axis1_all_mixed_dtypes(self): + self.df_mixed.dropna(how='all', axis=1) - def j(self): - for i in range(10000): - self.df3[0] + def time_dropna_axis1_any_mixed_dtypes(self): + self.df_mixed.dropna(how='any', axis=1) + def time_count_level_axis0_multi(self): + self.df_mi.count(axis=0, level=1) -class frame_itertuples(object): + def time_count_level_axis1_multi(self): + self.df_mi.count(axis=1, level=1) - def setup(self): - self.df = DataFrame(np.random.randn(50000, 10)) + def time_count_level_axis0_mixed_dtypes_multi(self): + self.df_mixed_mi.count(axis=0, level=1) - def time_frame_itertuples(self): - for row in self.df.itertuples(): - pass + def time_count_level_axis1_mixed_dtypes_multi(self): + self.df_mixed_mi.count(axis=1, level=1) -class frame_mask_bools(object): +class Apply(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + self.df = DataFrame(np.random.randn(1000, 100)) - def time_frame_mask_bools(self): - self.bools.mask(self.mask) + self.s = Series(np.arange(1028.0)) + self.df2 = DataFrame({i: self.s for i in range(1028)}) + + self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + def time_apply_user_func(self): + self.df2.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)])) -class frame_mask_floats(object): + def time_apply_axis_1(self): + self.df.apply((lambda x: (x + 1)), axis=1) + + def time_apply_lambda_mean(self): + self.df.apply((lambda x: x.sum())) + + def time_apply_np_mean(self): + self.df.apply(np.mean) + + def time_apply_pass_thru(self): + self.df.apply((lambda x: x)) + + def time_apply_ref_by_name(self): + self.df3.apply((lambda x: (x['A'] + x['B'])), axis=1) + + +#---------------------------------------------------------------------- +# dtypes + +class frame_dtypes(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + self.df = DataFrame(np.random.randn(1000, 1000)) - def time_frame_mask_floats(self): - self.bools.astype(float).mask(self.mask) + def time_frame_dtypes(self): + self.df.dtypes +#---------------------------------------------------------------------- +# equals -class frame_nonunique_equal(object): +class Equals(object): goal_time = 0.2 def setup(self): @@ -742,10 +320,9 @@ def setup(self): self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) self.nonunique_cols = self.object_df.copy() self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_nonunique_equal(self): - self.test_equal('nonunique_cols') + self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in ( + ('float_df', self.float_df), ('object_df', self.object_df), + ('nonunique_cols', self.nonunique_cols))]) def make_pair(self, frame): self.df = frame @@ -761,238 +338,246 @@ def test_unequal(self, name): (self.df, self.df2) = self.pairs[name] return self.df.equals(self.df2) + def time_frame_float_equal(self): + self.test_equal('float_df') -class frame_nonunique_unequal(object): - goal_time = 0.2 + def time_frame_float_unequal(self): + self.test_unequal('float_df') - def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + def time_frame_nonunique_equal(self): + self.test_equal('nonunique_cols') def time_frame_nonunique_unequal(self): self.test_unequal('nonunique_cols') - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) - - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_frame_object_equal(self): + self.test_equal('object_df') - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + def time_frame_object_unequal(self): + self.test_unequal('object_df') -class frame_object_equal(object): +class Interpolate(object): goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) + # this is the worst case, where every column has NaNs. + self.df = DataFrame(randn(10000, 100)) + self.df.values[::2] = np.nan - def time_frame_object_equal(self): - self.test_equal('object_df') + self.df2 = DataFrame( + {'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), + 'C': randn(10000), 'D': randn(10000),}) + self.df2.loc[1::5, 'A'] = np.nan + self.df2.loc[1::5, 'C'] = np.nan - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_interpolate(self): + self.df.interpolate() - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_interpolate_some_good(self): + self.df2.interpolate() - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) + def time_interpolate_some_good_infer(self): + self.df2.interpolate(downcast='infer') -class frame_object_unequal(object): +class Shift(object): + # frame shift speedup issue-5609 goal_time = 0.2 def setup(self): - self.float_df = DataFrame(np.random.randn(1000, 1000)) - self.object_df = DataFrame(([(['foo'] * 1000)] * 1000)) - self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns)) - self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (('float_df', self.float_df), ('object_df', self.object_df), ('nonunique_cols', self.nonunique_cols))]) - - def time_frame_object_unequal(self): - self.test_unequal('object_df') + self.df = DataFrame(np.random.rand(10000, 500)) - def make_pair(self, frame): - self.df = frame - self.df2 = self.df.copy() - self.df2.ix[((-1), (-1))] = np.nan - return (self.df, self.df2) + def time_shift_axis0(self): + self.df.shift(1, axis=0) - def test_equal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df) + def time_shift_axis_1(self): + self.df.shift(1, axis=1) - def test_unequal(self, name): - (self.df, self.df2) = self.pairs[name] - return self.df.equals(self.df2) +#----------------------------------------------------------------------------- +# from_records issue-6700 -class frame_reindex_axis0(object): +class frame_from_records_generator(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + def get_data(self, n=100000): + return ((x, (x * 20), (x * 100)) for x in range(n)) + + def time_frame_from_records_generator(self): + self.df = DataFrame.from_records(self.get_data()) + + def time_frame_from_records_generator_nrows(self): + self.df = DataFrame.from_records(self.get_data(), nrows=1000) + - def time_frame_reindex_axis0(self): - self.df.reindex(self.idx) +#----------------------------------------------------------------------------- +# duplicated -class frame_reindex_axis1(object): +class frame_duplicated(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) + self.n = (1 << 20) + self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) + self.xs = np.random.randn((self.n // 64)).round(2) + self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - def time_frame_reindex_axis1(self): - self.df.reindex(columns=self.idx) + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)) + def time_frame_duplicated(self): + self.df.duplicated() + + def time_frame_duplicated_wide(self): + self.df2.T.duplicated() -class frame_reindex_both_axes(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) - def time_frame_reindex_both_axes(self): - self.df.reindex(index=self.idx, columns=self.idx) -class frame_reindex_both_axes_ix(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(randn(10000, 10000)) - self.idx = np.arange(4000, 7000) - def time_frame_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] -class frame_reindex_upcast(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), 1: randint(0, 1000, 1000).astype(np.int16), 2: randint(0, 1000, 1000).astype(np.int32), 3: randint(0, 1000, 1000).astype(np.int64), }[randint(0, 4)]) for c in range(1000)])) - def time_frame_reindex_upcast(self): - self.df.reindex(permutation(range(1200))) -class frame_repr_tall(object): - goal_time = 0.2 - def setup(self): - self.df = pandas.DataFrame(np.random.randn(10000, 10)) - def time_frame_repr_tall(self): - repr(self.df) -class frame_repr_wide(object): +class frame_xs_col(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) + self.df = DataFrame(randn(1, 100000)) - def time_frame_repr_wide(self): - repr(self.df) + def time_frame_xs_col(self): + self.df.xs(50000, axis=1) -class frame_shift_axis0(object): +class frame_xs_row(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.rand(10000, 500)) + self.df = DataFrame(randn(100000, 1)) - def time_frame_shift_axis0(self): - self.df.shift(1, axis=0) + def time_frame_xs_row(self): + self.df.xs(50000) -class frame_shift_axis_1(object): +class frame_sort_index(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.rand(10000, 500)) + self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + + def time_frame_sort_index(self): + self.df.sort_index() - def time_frame_shift_axis_1(self): - self.df.shift(1, axis=1) -class frame_to_html_mixed(object): + +class frame_quantile_axis1(object): goal_time = 0.2 def setup(self): - self.nrows = 500 - self.df = DataFrame(randn(self.nrows, 10)) - self.df[0] = period_range('2000', '2010', self.nrows) - self.df[1] = range(self.nrows) + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_quantile_axis1(self): + self.df.quantile([0.1, 0.5], axis=1) - def time_frame_to_html_mixed(self): - self.df.to_html() +#---------------------------------------------------------------------- +# boolean indexing -class frame_to_string_floats(object): +class frame_boolean_row_select(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100, 10)) - - def time_frame_to_string_floats(self): - self.df.to_string() + self.df = DataFrame(randn(10000, 100)) + self.bool_arr = np.zeros(10000, dtype=bool) + self.bool_arr[:1000] = True + def time_frame_boolean_row_select(self): + self.df[self.bool_arr] -class frame_xs_col(object): +class frame_getitem_single_column(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(1, 100000)) + self.df = DataFrame(randn(10000, 1000)) + self.df2 = DataFrame(randn(3000, 1), columns=['A']) + self.df3 = DataFrame(randn(3000, 1)) - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) + def h(self): + for i in range(10000): + self.df2['A'] + def j(self): + for i in range(10000): + self.df3[0] -class frame_xs_row(object): + def time_frame_getitem_single_column(self): + self.h() + + def time_frame_getitem_single_column2(self): + self.j() + + +#---------------------------------------------------------------------- +# assignment + +class frame_assign_timeseries_index(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(100000, 1)) + self.idx = date_range('1/1/2000', periods=100000, freq='D') + self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - def time_frame_xs_row(self): - self.df.xs(50000) + def time_frame_assign_timeseries_index(self): + self.f(self.df) + def f(self, df): + self.x = self.df.copy() + self.x['date'] = self.x.index -class frame_sort_index(object): + + +# insert many columns + +class frame_insert_100_columns_begin(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) + self.N = 1000 - def time_frame_sort_index(self): - self.df.sort_index() + def f(self, K=100): + self.df = DataFrame(index=range(self.N)) + self.new_col = np.random.randn(self.N) + for i in range(K): + self.df.insert(0, i, self.new_col) + + def g(self, K=500): + self.df = DataFrame(index=range(self.N)) + self.new_col = np.random.randn(self.N) + for i in range(K): + self.df[i] = self.new_col + + def time_frame_insert_100_columns_begin(self): + self.f() + + def time_frame_insert_500_columns_end(self): + self.g() + +#---------------------------------------------------------------------- +# strings methods, #2602 + class series_string_vector_slice(object): goal_time = 0.2 @@ -1003,15 +588,17 @@ def time_series_string_vector_slice(self): self.s.str[:5] -class frame_quantile_axis1(object): +#---------------------------------------------------------------------- +# df.info() and get_dtype_counts() # 2807 + +class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) + self.df = DataFrame(np.random.randn(10, 10000)) - def time_frame_quantile_axis1(self): - self.df.quantile([0.1, 0.5], axis=1) + def time_frame_get_dtype_counts(self): + self.df.get_dtype_counts() class frame_nlargest(object): From 16104d21c8854a56ee7860844c15ba26f832d866 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 24 Oct 2016 16:13:56 +0200 Subject: [PATCH 14/20] Clean-up gil bencmarks --- asv_bench/benchmarks/gil.py | 173 +++++++++++++----------------------- 1 file changed, 64 insertions(+), 109 deletions(-) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 1c82560c7e630..3f53894364cd2 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -22,7 +22,7 @@ def wrapper(fname): return wrapper -class nogil_groupby_base(object): +class NoGilGroupby(object): goal_time = 0.2 def setup(self): @@ -30,167 +30,122 @@ def setup(self): self.ngroups = 1000 np.random.seed(1234) self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): - raise NotImplementedError - -class nogil_groupby_count_2(nogil_groupby_base): + np.random.seed(1234) + self.size = 2 ** 22 + self.ngroups = 100 + self.data = Series(np.random.randint(0, self.ngroups, size=self.size)) - def time_nogil_groupby_count_2(self): - self.pg2() + if (not have_real_test_parallel): + raise NotImplementedError @test_parallel(num_threads=2) - def pg2(self): + def _pg2_count(self): self.df.groupby('key')['data'].count() - -class nogil_groupby_last_2(nogil_groupby_base): - - def time_nogil_groupby_last_2(self): - self.pg2() + def time_count_2(self): + self._pg2_count() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_last(self): self.df.groupby('key')['data'].last() - -class nogil_groupby_max_2(nogil_groupby_base): - - def time_nogil_groupby_max_2(self): - self.pg2() + def time_last_2(self): + self._pg2_last() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_max(self): self.df.groupby('key')['data'].max() - -class nogil_groupby_mean_2(nogil_groupby_base): - - def time_nogil_groupby_mean_2(self): - self.pg2() + def time_max_2(self): + self._pg2_max() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_mean(self): self.df.groupby('key')['data'].mean() - -class nogil_groupby_min_2(nogil_groupby_base): - - def time_nogil_groupby_min_2(self): - self.pg2() + def time_mean_2(self): + self._pg2_mean() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_min(self): self.df.groupby('key')['data'].min() - -class nogil_groupby_prod_2(nogil_groupby_base): - - def time_nogil_groupby_prod_2(self): - self.pg2() + def time_min_2(self): + self._pg2_min() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_prod(self): self.df.groupby('key')['data'].prod() - -class nogil_groupby_sum_2(nogil_groupby_base): - - def time_nogil_groupby_sum_2(self): - self.pg2() + def time_prod_2(self): + self._pg2_prod() @test_parallel(num_threads=2) - def pg2(self): - self.df.groupby('key')['data'].sum() - - -class nogil_groupby_sum_4(nogil_groupby_base): - - def time_nogil_groupby_sum_4(self): - self.pg4() - - def f(self): + def _pg2_sum(self): self.df.groupby('key')['data'].sum() - def g4(self): - for i in range(4): - self.f() + def time_sum_2(self): + self._pg2_sum() @test_parallel(num_threads=4) - def pg4(self): - self.f() - + def _pg4_sum(self): + self.df.groupby('key')['data'].sum() -class nogil_groupby_sum_8(nogil_groupby_base): + def time_sum_4(self): + self._pg4_sum() - def time_nogil_groupby_sum_8(self): - self.pg8() + def time_sum_4_notp(self): + for i in range(4): + self.df.groupby('key')['data'].sum() - def f(self): + def _f_sum(self): self.df.groupby('key')['data'].sum() - def g8(self): - for i in range(8): - self.f() - @test_parallel(num_threads=8) - def pg8(self): - self.f() - + def _pg8_sum(self): + self._f_sum() -class nogil_groupby_var_2(nogil_groupby_base): + def time_sum_8(self): + self._pg8_sum() - def time_nogil_groupby_var_2(self): - self.pg2() + def time_sum_8_notp(self): + for i in range(8): + self._f_sum() @test_parallel(num_threads=2) - def pg2(self): + def _pg2_var(self): self.df.groupby('key')['data'].var() + def time_var_2(self): + self._pg2_var() -class nogil_groupby_groups(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.size = 2**22 - self.ngroups = 100 - self.data = Series(np.random.randint(0, self.ngroups, size=self.size)) - if (not have_real_test_parallel): - raise NotImplementedError + # get groups - def f(self): + def _groups(self): self.data.groupby(self.data).groups - -class nogil_groupby_groups_2(nogil_groupby_groups): - - def time_nogil_groupby_groups(self): - self.pg2() - @test_parallel(num_threads=2) - def pg2(self): - self.f() - + def _pg2_groups(self): + self._groups() -class nogil_groupby_groups_4(nogil_groupby_groups): - - def time_nogil_groupby_groups(self): - self.pg4() + def time_groups_2(self): + self._pg2_groups() @test_parallel(num_threads=4) - def pg4(self): - self.f() + def _pg4_groups(self): + self._groups() + def time_groups_4(self): + self._pg4_groups() -class nogil_groupby_groups_8(nogil_groupby_groups): + @test_parallel(num_threads=8) + def _pg8_groups(self): + self._groups() - def time_nogil_groupby_groups(self): - self.pg8() + def time_groups_8(self): + self._pg8_groups() - @test_parallel(num_threads=8) - def pg8(self): - self.f() class nogil_take1d_float64(object): @@ -408,19 +363,19 @@ def create_cols(self, name): def pg_read_csv(self): read_csv('__test__.csv', sep=',', header=None, float_precision=None) - def time_nogil_read_csv(self): + def time_read_csv(self): self.pg_read_csv() @test_parallel(num_threads=2) def pg_read_csv_object(self): read_csv('__test_object__.csv', sep=',') - def time_nogil_read_csv_object(self): + def time_read_csv_object(self): self.pg_read_csv_object() @test_parallel(num_threads=2) def pg_read_csv_datetime(self): read_csv('__test_datetime__.csv', sep=',', header=None) - def time_nogil_read_csv_datetime(self): + def time_read_csv_datetime(self): self.pg_read_csv_datetime() From f56e26aa0e9dcb42b93fd3f8926b88b97d731bde Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 26 Oct 2016 00:11:05 +0200 Subject: [PATCH 15/20] Clean-up indexing benchmarks --- asv_bench/benchmarks/binary_ops.py | 3 + asv_bench/benchmarks/index_object.py | 334 ++++++-------------- asv_bench/benchmarks/indexing.py | 377 +++++++---------------- asv_bench/benchmarks/pandas_vb_common.py | 4 +- 4 files changed, 215 insertions(+), 503 deletions(-) diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 938f59db37df6..53cb1cf465698 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -27,6 +27,9 @@ def time_frame_mult(self, use_numexpr, threads): def time_frame_multi_and(self, use_numexpr, threads): self.df[((self.df > 0) & (self.df2 > 0))] + def time_frame_comparison(self, use_numexpr, threads): + (self.df > self.df2) + def teardown(self, use_numexpr, threads): expr.set_use_numexpr(True) expr.set_numexpr_threads() diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 2c94f9b2b1e8c..3fb53ce9b3c98 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,102 +1,93 @@ from .pandas_vb_common import * -class datetime_index_intersection(object): +class SetOperations(object): goal_time = 0.2 def setup(self): self.rng = date_range('1/1/2000', periods=10000, freq='T') self.rng2 = self.rng[:(-1)] - def time_datetime_index_intersection(self): + # object index with datetime values + if (self.rng.dtype == object): + self.idx_rng = self.rng.view(Index) + else: + self.idx_rng = self.rng.asobject + self.idx_rng2 = self.idx_rng[:(-1)] + + # other datetime + N = 100000 + A = N - 20000 + B = N + 20000 + self.dtidx1 = DatetimeIndex(range(N)) + self.dtidx2 = DatetimeIndex(range(A, B)) + self.dtidx3 = DatetimeIndex(range(N, B)) + + # integer + self.N = 1000000 + self.options = np.arange(self.N) + self.left = Index( + self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) + self.right = Index( + self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) + + # strings + N = 10000 + strs = tm.rands_array(10, N) + self.leftstr = Index(strs[:N * 2 // 3]) + self.rightstr = Index(strs[N // 3:]) + + def time_datetime_intersection(self): self.rng.intersection(self.rng2) - -class datetime_index_repr(object): - goal_time = 0.2 - - def setup(self): - self.dr = pd.date_range('20000101', freq='D', periods=100000) - - def time_datetime_index_repr(self): - self.dr._is_dates_only - - -class datetime_index_union(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=10000, freq='T') - self.rng2 = self.rng[:(-1)] - - def time_datetime_index_union(self): + def time_datetime_union(self): self.rng.union(self.rng2) + def time_datetime_difference(self): + self.dtidx1.difference(self.dtidx2) -class index_datetime_intersection(object): - goal_time = 0.2 + def time_datetime_difference_disjoint(self): + self.dtidx1.difference(self.dtidx3) - def setup(self): - self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) - if (self.rng.dtype == object): - self.rng = self.rng.view(Index) - else: - self.rng = self.rng.asobject - self.rng2 = self.rng[:(-1)] + def time_datetime_symmetric_difference(self): + self.dtidx1.symmetric_difference(self.dtidx2) def time_index_datetime_intersection(self): - self.rng.intersection(self.rng2) - - -class index_datetime_union(object): - goal_time = 0.2 - - def setup(self): - self.rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) - if (self.rng.dtype == object): - self.rng = self.rng.view(Index) - else: - self.rng = self.rng.asobject - self.rng2 = self.rng[:(-1)] + self.idx_rng.intersection(self.idx_rng2) def time_index_datetime_union(self): - self.rng.union(self.rng2) + self.idx_rng.union(self.idx_rng2) + def time_int64_intersection(self): + self.left.intersection(self.right) -class index_datetime_set_difference(object): - goal_time = 0.2 + def time_int64_union(self): + self.left.union(self.right) - def setup(self): - self.N = 100000 - self.A = self.N - 20000 - self.B = self.N + 20000 - self.idx1 = DatetimeIndex(range(self.N)) - self.idx2 = DatetimeIndex(range(self.A, self.B)) - self.idx3 = DatetimeIndex(range(self.N, self.B)) + def time_int64_difference(self): + self.left.difference(self.right) - def time_index_datetime_difference(self): - self.idx1.difference(self.idx2) + def time_int64_symmetric_difference(self): + self.left.symmetric_difference(self.right) - def time_index_datetime_difference_disjoint(self): - self.idx1.difference(self.idx3) + def time_str_difference(self): + self.leftstr.difference(self.rightstr) - def time_index_datetime_symmetric_difference(self): - self.idx1.symmetric_difference(self.idx2) + def time_str_symmetric_difference(self): + self.leftstr.symmetric_difference(self.rightstr) -class index_float64_boolean_indexer(object): +class Datetime(object): goal_time = 0.2 def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + self.dr = pd.date_range('20000101', freq='D', periods=10000) - def time_index_float64_boolean_indexer(self): - self.idx[self.mask] + def time_is_dates_only(self): + self.dr._is_dates_only -class index_float64_boolean_series_indexer(object): +class Float64(object): goal_time = 0.2 def setup(self): @@ -104,141 +95,34 @@ def setup(self): self.mask = ((np.arange(self.idx.size) % 3) == 0) self.series_mask = Series(self.mask) - def time_index_float64_boolean_series_indexer(self): - self.idx[self.series_mask] - - -class index_float64_construct(object): - goal_time = 0.2 - - def setup(self): self.baseidx = np.arange(1000000.0) - def time_index_float64_construct(self): - Index(self.baseidx) - + def time_boolean_indexer(self): + self.idx[self.mask] -class index_float64_div(object): - goal_time = 0.2 + def time_boolean_series_indexer(self): + self.idx[self.series_mask] - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + def time_construct(self): + Index(self.baseidx) - def time_index_float64_div(self): + def time_div(self): (self.idx / 2) - -class index_float64_get(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_get(self): + def time_get(self): self.idx[1] - -class index_float64_mul(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_mul(self): + def time_mul(self): (self.idx * 2) - -class index_float64_slice_indexer_basic(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_slice_indexer_basic(self): + def time_slice_indexer_basic(self): self.idx[:(-1)] - -class index_float64_slice_indexer_even(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_float64_slice_indexer_even(self): + def time_slice_indexer_even(self): self.idx[::2] -class index_int64_intersection(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - - def time_index_int64_intersection(self): - self.left.intersection(self.right) - - -class index_int64_union(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - - def time_index_int64_union(self): - self.left.union(self.right) - - -class index_int64_set_difference(object): - goal_time = 0.2 - - def setup(self): - self.N = 500000 - self.options = np.arange(self.N) - self.left = Index(self.options.take( - np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index(self.options.take( - np.random.permutation(self.N)[:(self.N // 2)])) - - def time_index_int64_difference(self): - self.left.difference(self.right) - - def time_index_int64_symmetric_difference(self): - self.left.symmetric_difference(self.right) - - -class index_str_set_difference(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.strs = tm.rands_array(10, self.N) - self.left = Index(self.strs[:self.N * 2 // 3]) - self.right = Index(self.strs[self.N // 3:]) - - def time_str_difference(self): - self.left.difference(self.right) - - def time_str_symmetric_difference(self): - self.left.symmetric_difference(self.right) - - -class index_str_boolean_indexer(object): +class StringIndex(object): goal_time = 0.2 def setup(self): @@ -246,47 +130,20 @@ def setup(self): self.mask = ((np.arange(1000000) % 3) == 0) self.series_mask = Series(self.mask) - def time_index_str_boolean_indexer(self): + def time_boolean_indexer(self): self.idx[self.mask] - -class index_str_boolean_series_indexer(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_boolean_series_indexer(self): + def time_boolean_series_indexer(self): self.idx[self.series_mask] - -class index_str_slice_indexer_basic(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_slice_indexer_basic(self): + def time_slice_indexer_basic(self): self.idx[:(-1)] - -class index_str_slice_indexer_even(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_index_str_slice_indexer_even(self): + def time_slice_indexer_even(self): self.idx[::2] -class multiindex_duplicated(object): +class Multi1(object): goal_time = 0.2 def setup(self): @@ -295,21 +152,16 @@ def setup(self): self.labels = [np.random.choice(n, (k * n)) for lev in self.levels] self.mi = MultiIndex(levels=self.levels, labels=self.labels) - def time_multiindex_duplicated(self): - self.mi.duplicated() - - -class multiindex_from_product(object): - goal_time = 0.2 - - def setup(self): self.iterables = [tm.makeStringIndex(10000), range(20)] - def time_multiindex_from_product(self): + def time_duplicated(self): + self.mi.duplicated() + + def time_from_product(self): MultiIndex.from_product(self.iterables) -class multiindex_sortlevel_int64(object): +class Multi2(object): goal_time = 0.2 def setup(self): @@ -319,23 +171,22 @@ def setup(self): self.i = np.random.permutation(self.n) self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i] - def time_multiindex_sortlevel_int64(self): + self.a = np.repeat(np.arange(100), 1000) + self.b = np.tile(np.arange(1000), 100) + self.midx2 = MultiIndex.from_arrays([self.a, self.b]) + self.midx2 = self.midx2.take(np.random.permutation(np.arange(100000))) + + def time_sortlevel_int64(self): self.mi.sortlevel() + def time_sortlevel_zero(self): + self.midx2.sortlevel(0) -class multiindex_with_datetime_level_full(object): - goal_time = 0.2 + def time_sortlevel_one(self): + self.midx2.sortlevel(1) - def setup(self): - self.level1 = range(1000) - self.level2 = date_range(start='1/1/2012', periods=100) - self.mi = MultiIndex.from_product([self.level1, self.level2]) - def time_multiindex_with_datetime_level_full(self): - self.mi.copy().values - - -class multiindex_with_datetime_level_sliced(object): +class Multi3(object): goal_time = 0.2 def setup(self): @@ -343,5 +194,8 @@ def setup(self): self.level2 = date_range(start='1/1/2012', periods=100) self.mi = MultiIndex.from_product([self.level1, self.level2]) - def time_multiindex_with_datetime_level_sliced(self): + def time_datetime_level_values_full(self): + self.mi.copy().values + + def time_datetime_level_values_sliced(self): self.mi[:10].values diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 7704b9c6571b9..27cd320c661e0 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -5,184 +5,171 @@ expr = None -class dataframe_getitem_scalar(object): +class Int64Indexing(object): goal_time = 0.2 def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - def time_dataframe_getitem_scalar(self): - self.df[self.col][self.idx] - + self.s = Series(np.random.rand(1000000)) -class series_get_value(object): - goal_time = 0.2 + def time_getitem_scalar(self): + self.s[800000] - def setup(self): - self.index = tm.makeStringIndex(1000) - self.s = Series(np.random.rand(1000), index=self.index) - self.idx = self.index[100] + def time_getitem_slice(self): + self.s[:800000] - def time_series_get_value(self): - self.s.get_value(self.idx) + def time_getitem_list_like(self): + self.s[[800000]] + def time_getitem_array(self): + self.s[np.arange(10000)] -class time_series_getitem_scalar(object): - goal_time = 0.2 + def time_iloc_array(self): + self.s.iloc[np.arange(10000)] - def setup(self): - tm.N = 1000 - self.ts = tm.makeTimeSeries() - self.dt = self.ts.index[500] + def time_iloc_list_like(self): + self.s.iloc[[800000]] - def time_time_series_getitem_scalar(self): - self.ts[self.dt] + def time_iloc_scalar(self): + self.s.iloc[800000] + def time_iloc_slice(self): + self.s.iloc[:800000] -class frame_iloc_big(object): - goal_time = 0.2 + def time_ix_array(self): + self.s.ix[np.arange(10000)] - def setup(self): - self.df = DataFrame(dict(A=(['foo'] * 1000000))) + def time_ix_list_like(self): + self.s.ix[[800000]] - def time_frame_iloc_big(self): - self.df.iloc[:100, 0] + def time_ix_scalar(self): + self.s.ix[800000] + def time_ix_slice(self): + self.s.ix[:800000] -class frame_iloc_dups(object): - goal_time = 0.2 + def time_loc_array(self): + self.s.loc[np.arange(10000)] - def setup(self): - self.df = DataFrame({'A': ([0.1] * 3000), 'B': ([1] * 3000), }) - self.idx = (np.array(range(30)) * 99) - self.df2 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000), }) - self.df2 = concat([self.df2, (2 * self.df2), (3 * self.df2)]) + def time_loc_list_like(self): + self.s.loc[[800000]] - def time_frame_iloc_dups(self): - self.df2.iloc[self.idx] + def time_loc_scalar(self): + self.s.loc[800000] - def time_frame_loc_dups(self): - self.df2.loc[self.idx] + def time_loc_slice(self): + self.s.loc[:800000] -class frame_xs_mi_ix(object): +class StringIndexing(object): goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) - self.df = DataFrame(self.s) - - def time_frame_xs_mi_ix(self): - self.df.ix[999] - - -class indexing_dataframe_boolean(object): - goal_time = 0.2 + self.index = tm.makeStringIndex(1000000) + self.s = Series(np.random.rand(1000000), index=self.index) + self.lbl = self.s.index[800000] - def setup(self): - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) + def time_getitem_label_slice(self): + self.s[:self.lbl] - def time_indexing_dataframe_boolean(self): - (self.df > self.df2) + def time_getitem_pos_slice(self): + self.s[:800000] + def time_get_value(self): + self.s.get_value(self.lbl) -class indexing_dataframe_boolean_no_ne(object): - goal_time = 0.2 - params = [True, False] - param_names = ['use_numexpr'] - - def setup(self, use_numexpr): - if (expr is None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - if use_numexpr: - expr.set_numexpr_threads(1) - else: - expr.set_use_numexpr(False) - - def time_indexing_dataframe_boolean_no_ne(self, use_numexpr): - (self.df > self.df2) - - def teardown(self, use_numexpr): - if use_numexpr: - expr.set_numexpr_threads() - else: - expr.set_use_numexpr(True) - -class indexing_dataframe_boolean_st(object): +class DatetimeIndexing(object): goal_time = 0.2 def setup(self): - if (expr is None): - raise NotImplementedError - self.df = DataFrame(np.random.randn(50000, 100)) - self.df2 = DataFrame(np.random.randn(50000, 100)) - expr.set_numexpr_threads(1) - - def time_indexing_dataframe_boolean_st(self): - (self.df > self.df2) - - def teardown(self): - expr.set_numexpr_threads() + tm.N = 1000 + self.ts = tm.makeTimeSeries() + self.dt = self.ts.index[500] + def time_getitem_scalar(self): + self.ts[self.dt] + -class indexing_dataframe_boolean_rows(object): +class DataFrameIndexing(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df['B'] > 0) + self.index = tm.makeStringIndex(1000) + self.columns = tm.makeStringIndex(30) + self.df = DataFrame(np.random.randn(1000, 30), index=self.index, + columns=self.columns) + self.idx = self.index[100] + self.col = self.columns[10] + + self.df2 = DataFrame(np.random.randn(10000, 4), + columns=['A', 'B', 'C', 'D']) + self.indexer = (self.df2['B'] > 0) self.obj_indexer = self.indexer.astype('O') - def time_indexing_dataframe_boolean_rows(self): - self.df[self.indexer] + # duptes + self.idx_dupe = (np.array(range(30)) * 99) + self.df3 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000),}) + self.df3 = concat([self.df3, (2 * self.df3), (3 * self.df3)]) - def time_indexing_dataframe_boolean_rows_object(self): - self.df[self.obj_indexer] + self.df_big = DataFrame(dict(A=(['foo'] * 1000000))) + def time_get_value(self): + self.df.get_value(self.idx, self.col) + def time_get_value_ix(self): + self.df.ix[(self.idx, self.col)] + def time_getitem_scalar(self): + self.df[self.col][self.idx] -class indexing_frame_get_value(object): - goal_time = 0.2 + def time_boolean_rows(self): + self.df2[self.indexer] - def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] + def time_boolean_rows_object(self): + self.df2[self.obj_indexer] - def time_indexing_frame_get_value(self): - self.df.get_value(self.idx, self.col) + def time_iloc_dups(self): + self.df3.iloc[self.idx_dupe] - def time_indexing_frame_get_value_ix(self): - self.df.ix[(self.idx, self.col)] + def time_loc_dups(self): + self.df3.loc[self.idx_dupe] + + def time_iloc_big(self): + self.df_big.iloc[:100, 0] -class indexing_panel_subset(object): +class IndexingMethods(object): + # GH 13166 goal_time = 0.2 def setup(self): - self.p = Panel(np.random.randn(100, 100, 100)) - self.inds = range(0, 100, 10) + a = np.arange(100000) + self.ind = pd.Float64Index(a * 4.8000000418824129e-08) - def time_indexing_panel_subset(self): - self.p.ix[(self.inds, self.inds, self.inds)] + self.s = Series(np.random.rand(100000)) + self.ts = Series(np.random.rand(100000), + index=date_range('2011-01-01', freq='S', periods=100000)) + self.indexer = ([True, False, True, True, False] * 20000) + def time_get_loc_float(self): + self.ind.get_loc(0) + + def time_take_dtindex(self): + self.ts.take(self.indexer) -class multiindex_slicers(object): + def time_take_intindex(self): + self.s.take(self.indexer) + + +class MultiIndexing(object): goal_time = 0.2 def setup(self): + self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) + self.s = Series(np.random.randn(1000000), index=self.mi) + self.df = DataFrame(self.s) + + # slicers np.random.seed(1234) self.idx = pd.IndexSlice self.n = 100000 @@ -203,154 +190,22 @@ def setup(self): self.eps_D = 5000 self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() - def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] - - -class series_getitem_array(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_array(self): - self.s[np.arange(10000)] - - def time_series_getitem_list_like(self): - self.s[[800000]] - - -class series_getitem_label_slice(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(1000000) - self.s = Series(np.random.rand(1000000), index=self.index) - self.lbl = self.s.index[800000] - - def time_series_getitem_label_slice(self): - self.s[:self.lbl] - - def time_series_getitem_pos_slice(self): - self.s[:800000] - - -class SeriesIndexing(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(1000000)) - - def time_series_getitem_scalar(self): - self.s[800000] - - def time_series_getitem_slice(self): - self.s[:800000] - - def time_series_iloc_array(self): - self.s.iloc[np.arange(10000)] - - def time_series_iloc_list_like(self): - self.s.iloc[[800000]] - - def time_series_iloc_scalar(self): - self.s.iloc[800000] - - def time_series_iloc_slice(self): - self.s.iloc[:800000] - - def time_series_ix_array(self): - self.s.ix[np.arange(10000)] - - def time_series_ix_list_like(self): - self.s.ix[[800000]] - - def time_series_ix_scalar(self): - self.s.ix[800000] - - def time_series_ix_slice(self): - self.s.ix[:800000] - - def time_series_loc_array(self): - self.s.loc[np.arange(10000)] - - def time_series_loc_list_like(self): - self.s.loc[[800000]] - - def time_series_loc_scalar(self): - self.s.loc[800000] - - def time_series_loc_slice(self): - self.s.loc[:800000] - - -class series_take_dtindex(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) - - def time_series_take_dtindex(self): - self.ts.take(self.indexer) - - -class series_take_intindex(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) - - def time_series_take_intindex(self): - self.s.take(self.indexer) - - -class series_xs_mi_ix(object): - goal_time = 0.2 - - def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) - def time_series_xs_mi_ix(self): self.s.ix[999] + def time_frame_xs_mi_ix(self): + self.df.ix[999] -class sort_level_one(object): - goal_time = 0.2 - - def setup(self): - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx = MultiIndex.from_arrays([self.a, self.b]) - self.midx = self.midx.take(np.random.permutation(np.arange(100000))) - - def time_sort_level_one(self): - self.midx.sortlevel(1) - - -class sort_level_zero(object): - goal_time = 0.2 - - def setup(self): - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx = MultiIndex.from_arrays([self.a, self.b]) - self.midx = self.midx.take(np.random.permutation(np.arange(100000))) + def time_multiindex_slicers(self): + self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] - def time_sort_level_zero(self): - self.midx.sortlevel(0) -class float_loc(object): - # GH 13166 +class PanelIndexing(object): goal_time = 0.2 def setup(self): - a = np.arange(100000) - self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + self.p = Panel(np.random.randn(100, 100, 100)) + self.inds = range(0, 100, 10) - def time_float_loc(self): - self.ind.get_loc(0) + def time_subset(self): + self.p.ix[(self.inds, self.inds, self.inds)] diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 3370131929c22..25b0b5dd4d1b0 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -20,9 +20,9 @@ import pandas.lib as lib try: - Panel = WidePanel + Panel = Panel except Exception: - pass + Panel = WidePanel # didn't add to namespace until later try: From 1baf3fc684f15417b25fa8a355bdedaa0c49414e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 26 Oct 2016 11:28:35 +0200 Subject: [PATCH 16/20] Clean-up inference/miscellaneous benchmarks --- asv_bench/benchmarks/algorithms.py | 9 +- asv_bench/benchmarks/attrs_caching.py | 17 +++ asv_bench/benchmarks/inference.py | 161 ++++++++------------------ asv_bench/benchmarks/miscellaneous.py | 52 --------- 4 files changed, 72 insertions(+), 167 deletions(-) delete mode 100644 asv_bench/benchmarks/miscellaneous.py diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7f427b7a2e4fc..c4a6117c0704a 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -24,6 +24,10 @@ def setup(self): self.arrneg = np.arange(-1000000, 0) self.arrmixed = np.array([1, -1]).repeat(500000) + # match + self.uniques = tm.makeStringIndex(1000).values + self.all = self.uniques.repeat(10) + def time_factorize_int(self): self.int.factorize() @@ -39,6 +43,9 @@ def time_duplicated_int(self): def time_duplicated_float(self): self.float.duplicated() + def time_match_strings(self): + pd.match(self.all, self.uniques) + def time_add_overflow_pos_scalar(self): self.checked_add(self.arr, 1) @@ -58,7 +65,7 @@ def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) -class hashing(object): +class Hashing(object): goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index 1f41d24fd4cbc..9210f1f2878d4 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ from .pandas_vb_common import * +from pandas.util.decorators import cache_readonly class DataFrameAttributes(object): @@ -13,3 +14,19 @@ def time_get_index(self): def time_set_index(self): self.df.index = self.cur_index + + +class CacheReadonly(object): + goal_time = 0.2 + + def setup(self): + + class Foo: + + @cache_readonly + def prop(self): + return 5 + self.obj = Foo() + + def time_cache_readonly(self): + self.obj.prop diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 136d5fc12d3e8..6eda93c0a1dc8 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -2,143 +2,76 @@ import pandas as pd -class dtype_infer_datetime64(object): +class DtypeInfer(object): goal_time = 0.2 - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_datetime64(self): - (self.df_datetime64['A'] - self.df_datetime64['B']) - - -class dtype_infer_float32(object): - goal_time = 0.2 + # from GH 7332 def setup(self): self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_float32(self): - (self.df_float32['A'] + self.df_float32['B']) + self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), + B=np.arange(self.N, dtype='int64'))) + self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), + B=np.arange(self.N, dtype='int32'))) + self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), + B=np.arange(self.N, dtype='uint32'))) + self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), + B=np.arange(self.N, dtype='float64'))) + self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), + B=np.arange(self.N, dtype='float32'))) + self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), + B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) + self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), + B=self.df_datetime64['B'])) + + def time_int64(self): + (self.df_int64['A'] + self.df_int64['B']) + def time_int32(self): + (self.df_int32['A'] + self.df_int32['B']) -class dtype_infer_float64(object): - goal_time = 0.2 + def time_uint32(self): + (self.df_uint32['A'] + self.df_uint32['B']) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_float64(self): + def time_float64(self): (self.df_float64['A'] + self.df_float64['B']) + def time_float32(self): + (self.df_float32['A'] + self.df_float32['B']) -class dtype_infer_int32(object): - goal_time = 0.2 - - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_int32(self): - (self.df_int32['A'] + self.df_int32['B']) - + def time_datetime64(self): + (self.df_datetime64['A'] - self.df_datetime64['B']) -class dtype_infer_int64(object): - goal_time = 0.2 + def time_timedelta64_1(self): + (self.df_timedelta64['A'] + self.df_timedelta64['B']) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_int64(self): - (self.df_int64['A'] + self.df_int64['B']) + def time_timedelta64_2(self): + (self.df_timedelta64['A'] + self.df_timedelta64['A']) -class dtype_infer_timedelta64_1(object): +class to_numeric(object): goal_time = 0.2 def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_timedelta64_1(self): - (self.df_timedelta64['A'] + self.df_timedelta64['B']) - + self.n = 10000 + self.float = Series(np.random.randn(self.n * 100)) + self.numstr = self.float.astype('str') + self.str = Series(tm.makeStringIndex(self.n)) -class dtype_infer_timedelta64_2(object): - goal_time = 0.2 - - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_timedelta64_2(self): - (self.df_timedelta64['A'] + self.df_timedelta64['A']) + def time_from_float(self): + pd.to_numeric(self.float) + def time_from_numeric_str(self): + pd.to_numeric(self.numstr) -class dtype_infer_uint32(object): - goal_time = 0.2 + def time_from_str_ignore(self): + pd.to_numeric(self.str, errors='ignore') - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) - - def time_dtype_infer_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) + def time_from_str_coerce(self): + pd.to_numeric(self.str, errors='coerce') -class to_numeric(object): +class to_numeric_downcast(object): param_names = ['dtype', 'downcast'] params = [['string-float', 'string-int', 'string-nint', 'datetime64', @@ -162,4 +95,4 @@ def setup(self, dtype, downcast): self.data = self.data_dict[dtype] def time_downcast(self, dtype, downcast): - pd.to_numeric(self.data, downcast=downcast) + pd.to_numeric(self.data, downcast=downcast) \ No newline at end of file diff --git a/asv_bench/benchmarks/miscellaneous.py b/asv_bench/benchmarks/miscellaneous.py deleted file mode 100644 index f9d577a2b56d7..0000000000000 --- a/asv_bench/benchmarks/miscellaneous.py +++ /dev/null @@ -1,52 +0,0 @@ -from .pandas_vb_common import * -from pandas.util.decorators import cache_readonly - - -class match_strings(object): - goal_time = 0.2 - - def setup(self): - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) - - def time_match_strings(self): - match(self.all, self.uniques) - - -class misc_cache_readonly(object): - goal_time = 0.2 - - def setup(self): - - - class Foo: - - @cache_readonly - def prop(self): - return 5 - self.obj = Foo() - - def time_misc_cache_readonly(self): - self.obj.prop - - -class to_numeric(object): - goal_time = 0.2 - - def setup(self): - self.n = 10000 - self.float = Series(np.random.randn(self.n * 100)) - self.numstr = self.float.astype('str') - self.str = Series(tm.makeStringIndex(self.n)) - - def time_from_float(self): - pd.to_numeric(self.float) - - def time_from_numeric_str(self): - pd.to_numeric(self.numstr) - - def time_from_str_ignore(self): - pd.to_numeric(self.str, errors='ignore') - - def time_from_str_coerce(self): - pd.to_numeric(self.str, errors='coerce') From d54f18cf1b8335a9f778e8a1b5cccfc10ae8ef3b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 26 Oct 2016 11:56:54 +0200 Subject: [PATCH 17/20] Clean-up timedelta/period/plotting benchmarks --- asv_bench/benchmarks/period.py | 40 ++++++++++++++++--------------- asv_bench/benchmarks/plotting.py | 6 ++--- asv_bench/benchmarks/timedelta.py | 24 +++++++------------ 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 75b2c2dcacfed..ff5a201057bcd 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,31 +1,33 @@ +import pandas as pd from pandas import Series, Period, PeriodIndex, date_range -class create_period_index_from_date_range(object): +class Constructor(object): goal_time = 0.2 - def time_period_index(self): - # Simulate irregular PeriodIndex - PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') + def setup(self): + self.rng = date_range('1985', periods=1000) + self.rng2 = date_range('1985', periods=1000).to_pydatetime() + + def time_from_date_range(self): + PeriodIndex(self.rng, freq='D') + def time_from_pydatetime(self): + PeriodIndex(self.rng2, freq='D') -class period_setitem(object): + +class DataFrame(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/1990', freq='S', periods=20000) - self.df = DataFrame(index=range(len(self.rng))) - - def time_period_setitem(self): + self.rng = pd.period_range(start='1/1/1990', freq='S', periods=20000) + self.df = pd.DataFrame(index=range(len(self.rng))) + + def time_setitem_period_column(self): self.df['col'] = self.rng -class period_algorithm(object): +class Algorithms(object): goal_time = 0.2 def setup(self): @@ -34,16 +36,16 @@ def setup(self): self.s = Series(data * 1000) self.i = PeriodIndex(data, freq='M') - def time_period_series_drop_duplicates(self): + def time_drop_duplicates_pseries(self): self.s.drop_duplicates() - def time_period_index_drop_duplicates(self): + def time_drop_duplicates_pindex(self): self.i.drop_duplicates() - def time_period_series_value_counts(self): + def time_value_counts_pseries(self): self.s.value_counts() - def time_period_index_value_counts(self): + def time_value_counts_pindex(self): self.i.value_counts() diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 7a4a98e2195c2..3350ddaccc496 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -7,7 +7,7 @@ def date_range(start=None, end=None, periods=None, freq=None): from pandas.tools.plotting import andrews_curves -class plot_timeseries_period(object): +class TimeseriesPlotting(object): goal_time = 0.2 def setup(self): @@ -17,11 +17,11 @@ def setup(self): self.M = 5 self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) - def time_plot_timeseries_period(self): + def time_plot_regular(self): self.df.plot() -class plot_andrews_curves(object): +class Misc(object): goal_time = 0.6 def setup(self): diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 78b6aab9fb5c8..c112d1ef72eb8 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -2,7 +2,7 @@ from pandas import to_timedelta, Timestamp -class Timedelta(object): +class ToTimedelta(object): goal_time = 0.2 def setup(self): @@ -12,6 +12,9 @@ def setup(self): self.arr3 = np.random.randint(0, 60, size=10000) self.arr3 = ['00:00:{0:02d}'.format(i) for i in self.arr3] + self.arr4 = list(self.arr2) + self.arr4[-1] = 'apple' + def time_convert_int(self): to_timedelta(self.arr, unit='s') @@ -21,23 +24,14 @@ def time_convert_string(self): def time_convert_string_seconds(self): to_timedelta(self.arr3) + def time_convert_coerce(self): + to_timedelta(self.arr4, errors='coerce') -class timedelta_convert_bad_parse(object): - goal_time = 0.2 - - def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) - self.arr = ['{0} days'.format(i) for i in self.arr] - self.arr[-1] = 'apple' - - def time_timedelta_convert_coerce(self): - to_timedelta(self.arr, errors='coerce') - - def time_timedelta_convert_ignore(self): - to_timedelta(self.arr, errors='ignore') + def time_convert_ignore(self): + to_timedelta(self.arr4, errors='ignore') -class timedelta_add_overflow(object): +class Ops(object): goal_time = 0.2 def setup(self): From 38261477c5f8b85ce21108b2de816a778e86fdf2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Oct 2016 16:25:43 +0200 Subject: [PATCH 18/20] Cleanup indexing benchmarks --- asv_bench/benchmarks/frame_methods.py | 29 +- asv_bench/benchmarks/reindex.py | 423 +++++++------------------- 2 files changed, 142 insertions(+), 310 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index b0788891a3418..3daffb9d3a1cc 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -30,7 +30,7 @@ class Reindex(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 10000)) + self.df = DataFrame(randn(10000, 1000)) self.idx = np.arange(4000, 7000) self.df2 = DataFrame( @@ -210,6 +210,20 @@ def time_frame_fillna_inplace(self): self.df.fillna(0, inplace=True) + +class frame_fillna_many_columns_pad(object): + goal_time = 0.2 + + def setup(self): + self.values = np.random.randn(1000, 1000) + self.values[::2] = np.nan + self.df = DataFrame(self.values) + + def time_frame_fillna_many_columns_pad(self): + self.df.fillna(method='pad') + + + class Dropna(object): goal_time = 0.2 @@ -478,6 +492,19 @@ def time_frame_sort_index(self): self.df.sort_index() +class frame_sort_index_by_columns(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) + + def time_frame_sort_index_by_columns(self): + self.df.sort_index(by=['key1', 'key2']) class frame_quantile_axis1(object): diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index b1c039058ff8f..8db0cd7629332 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -2,175 +2,52 @@ from random import shuffle -class dataframe_reindex(object): +class Reindexing(object): goal_time = 0.2 def setup(self): - self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute()) - self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, columns=range(10)) + self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') + self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, + columns=range(10)) self.df['foo'] = 'bar' self.rng2 = Index(self.rng[::2]) - def time_dataframe_reindex(self): - self.df.reindex(self.rng2) - - -class frame_drop_dup_inplace(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_drop_dup_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) - - -class frame_drop_dup_na_inplace(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_frame_drop_dup_na_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) - - -class frame_drop_duplicates(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_drop_duplicates(self): - self.df.drop_duplicates(['key1', 'key2']) - - -class frame_drop_duplicates_int(object): - - def setup(self): - np.random.seed(1234) - self.N = 1000000 - self.K = 10000 - self.key1 = np.random.randint(0,self.K,size=self.N) - self.df = DataFrame({'key1': self.key1}) - - def time_frame_drop_duplicates_int(self): - self.df.drop_duplicates() - - -class frame_drop_duplicates_na(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - - def time_frame_drop_duplicates_na(self): - self.df.drop_duplicates(['key1', 'key2']) - - -class frame_fillna_many_columns_pad(object): - goal_time = 0.2 - - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') - - -class frame_reindex_columns(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30)) - - def time_frame_reindex_columns(self): - self.df.reindex(columns=self.df.columns[1:5]) - + self.df2 = DataFrame(index=range(10000), + data=np.random.rand(10000, 30), columns=range(30)) -class frame_sort_index_by_columns(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) - - -class lib_fast_zip(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) + # multi-index + N = 1000 + K = 20 + level1 = tm.makeStringIndex(N).values.repeat(K) + level2 = np.tile(tm.makeStringIndex(K).values, N) + index = MultiIndex.from_arrays([level1, level2]) + self.s1 = Series(np.random.randn((N * K)), index=index) + self.s2 = self.s1[::2] - def time_lib_fast_zip(self): - lib.fast_zip(self.col_array_list) + def time_reindex_dates(self): + self.df.reindex(self.rng2) + def time_reindex_columns(self): + self.df2.reindex(columns=self.df.columns[1:5]) -class lib_fast_zip_fillna(object): - goal_time = 0.2 + def time_reindex_multiindex(self): + self.s1.reindex(self.s2.index) - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - self.df.ix[:10000, :] = np.nan - def time_lib_fast_zip_fillna(self): - lib.fast_zip_fillna(self.col_array_list) +#---------------------------------------------------------------------- +# Pad / backfill -class reindex_daterange_backfill(object): +class FillMethod(object): goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) + self.rng = date_range('1/1/2000', periods=100000, freq='1min') self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) self.ts2 = self.ts[::2] self.ts3 = self.ts2.reindex(self.ts.index) self.ts4 = self.ts3.astype('float32') - def time_reindex_daterange_backfill(self): - self.backfill(self.ts2, self.ts.index) - def pad(self, source_series, target_index): try: source_series.reindex(target_index, method='pad') @@ -183,215 +60,143 @@ def backfill(self, source_series, target_index): except: source_series.reindex(target_index, fillMethod='backfill') + def time_backfill_dates(self): + self.backfill(self.ts2, self.ts.index) -class reindex_daterange_pad(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_daterange_pad(self): + def time_pad_daterange(self): self.pad(self.ts2, self.ts.index) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - -class reindex_fillna_backfill(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def time_reindex_fillna_backfill(self): + def time_backfill(self): self.ts3.fillna(method='backfill') - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - -class reindex_fillna_backfill_float32(object): - goal_time = 0.2 + def time_backfill_float32(self): + self.ts4.fillna(method='backfill') - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + def time_pad(self): + self.ts3.fillna(method='pad') - def time_reindex_fillna_backfill_float32(self): - self.ts4.fillna(method='backfill') + def time_pad_float32(self): + self.ts4.fillna(method='pad') - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +#---------------------------------------------------------------------- +# align on level -class reindex_fillna_pad(object): +class LevelAlign(object): goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + self.index = MultiIndex( + levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) + random.shuffle(self.index.values) + self.df = DataFrame(np.random.randn(len(self.index), 4), + index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), + index=self.index.levels[1]) - def time_reindex_fillna_pad(self): - self.ts3.fillna(method='pad') + def time_align_level(self): + self.df.align(self.df_level, level=1, copy=False) - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def time_reindex_level(self): + self.df_level.reindex(self.df.index, level=1) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') +#---------------------------------------------------------------------- +# drop_duplicates -class reindex_fillna_pad_float32(object): + +class Duplicates(object): goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, + 'value': np.random.randn((self.N * self.K)),}) + self.col_array_list = list(self.df.values.T) - def time_reindex_fillna_pad_float32(self): - self.ts4.fillna(method='pad') + self.df2 = self.df.copy() + self.df2.ix[:10000, :] = np.nan - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + self.s = Series(np.random.randint(0, 1000, size=10000)) + self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') + np.random.seed(1234) + self.N = 1000000 + self.K = 10000 + self.key1 = np.random.randint(0, self.K, size=self.N) + self.df_int = DataFrame({'key1': self.key1}) + def time_frame_drop_dups(self): + self.df.drop_duplicates(['key1', 'key2']) -class reindex_frame_level_align(object): - goal_time = 0.2 + def time_frame_drop_dups_inplace(self): + self.df.drop_duplicates(['key1', 'key2'], inplace=True) - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_frame_drop_dups_na(self): + self.df2.drop_duplicates(['key1', 'key2']) - def time_reindex_frame_level_align(self): - self.df.align(self.df_level, level=1, copy=False) + def time_frame_drop_dups_na_inplace(self): + self.df2.drop_duplicates(['key1', 'key2'], inplace=True) + def time_series_drop_dups_int(self): + self.s.drop_duplicates() -class reindex_frame_level_reindex(object): - goal_time = 0.2 + def time_series_drop_dups_string(self): + self.s2.drop_duplicates() - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_frame_drop_dups_int(self): + self.df_int.drop_duplicates() - def time_reindex_frame_level_reindex(self): - self.df_level.reindex(self.df.index, level=1) +#---------------------------------------------------------------------- +# blog "pandas escaped the zoo" -class reindex_multiindex(object): + +class Align(object): goal_time = 0.2 def setup(self): - self.N = 1000 - self.K = 20 - self.level1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.level2 = np.tile(tm.makeStringIndex(self.K).values, self.N) - self.index = MultiIndex.from_arrays([self.level1, self.level2]) - self.s1 = Series(np.random.randn((self.N * self.K)), index=self.index) - self.s2 = self.s1[::2] - - def time_reindex_multiindex(self): - self.s1.reindex(self.s2.index) - + n = 50000 + indices = tm.makeStringIndex(n) + subsample_size = 40000 -class series_align_irregular_string(object): - goal_time = 0.2 + def sample(values, k): + sampler = np.arange(len(values)) + shuffle(sampler) + return values.take(sampler[:k]) - def setup(self): - self.n = 50000 - self.indices = tm.makeStringIndex(self.n) - self.subsample_size = 40000 - self.x = Series(np.random.randn(50000), self.indices) - self.y = Series(np.random.randn(self.subsample_size), index=self.sample(self.indices, self.subsample_size)) + self.x = Series(np.random.randn(50000), indices) + self.y = Series(np.random.randn(subsample_size), + index=sample(indices, subsample_size)) - def time_series_align_irregular_string(self): + def time_align_series_irregular_string(self): (self.x + self.y) - def sample(self, values, k): - self.sampler = np.arange(len(values)) - shuffle(self.sampler) - return values.take(self.sampler[:k]) - -class series_drop_duplicates_int(object): +class LibFastZip(object): goal_time = 0.2 def setup(self): - self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - - def time_series_drop_duplicates_int(self): - self.s.drop_duplicates() - + self.N = 10000 + self.K = 10 + self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) + self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) + self.col_array_list = list(self.df.values.T) -class series_drop_duplicates_string(object): - goal_time = 0.2 + self.df2 = self.df.copy() + self.df2.ix[:10000, :] = np.nan + self.col_array_list2 = list(self.df2.values.T) - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + def time_lib_fast_zip(self): + lib.fast_zip(self.col_array_list) - def time_series_drop_duplicates_string(self): - self.s2.drop_duplicates() + def time_lib_fast_zip_fillna(self): + lib.fast_zip_fillna(self.col_array_list2) From fbe8c5ed2d7948b4b917da9fafe86b06ec97f661 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 18 Nov 2016 10:48:43 +0100 Subject: [PATCH 19/20] Clean-up groupby benchmarks --- asv_bench/benchmarks/groupby.py | 477 +++++++++++++------------------- 1 file changed, 190 insertions(+), 287 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5f3671012e6d5..ad58cd0fc6d70 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -35,107 +35,67 @@ def time_groupby_apply_dict_return(self): #---------------------------------------------------------------------- # groups -class groupby_groups(object): +class Groups(object): goal_time = 0.1 - def setup(self): - size = 2**22 - self.data = Series(np.random.randint(0, 100, size=size)) - self.data2 = Series(np.random.randint(0, 10000, size=size)) - self.data3 = Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))) - self.data4 = Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) - - def time_groupby_groups_int64_small(self): - self.data.groupby(self.data).groups + size = 2 ** 22 + data = { + 'int64_small': Series(np.random.randint(0, 100, size=size)), + 'int64_large' : Series(np.random.randint(0, 10000, size=size)), + 'object_small': Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))), + 'object_large': Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) + } - def time_groupby_groups_int64_large(self): - self.data2.groupby(self.data2).groups + param_names = ['df'] + params = ['int64_small', 'int64_large', 'object_small', 'object_large'] - def time_groupby_groups_object_small(self): - self.data3.groupby(self.data3).groups + def setup(self, df): + self.df = self.data[df] - def time_groupby_groups_object_large(self): - self.data4.groupby(self.data4).groups + def time_groupby_groups(self, df): + self.df.groupby(self.df).groups #---------------------------------------------------------------------- # First / last functions -class groupby_first_last(object): - goal_time = 0.2 - - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) - - def time_groupby_first_float32(self): - self.data2.groupby(self.labels).first() - - def time_groupby_first_float64(self): - self.data.groupby(self.labels).first() - - def time_groupby_last_float32(self): - self.data2.groupby(self.labels).last() - - def time_groupby_last_float64(self): - self.data.groupby(self.labels).last() - - def time_groupby_nth_float32_any(self): - self.data2.groupby(self.labels).nth(0, dropna='all') - - def time_groupby_nth_float32_none(self): - self.data2.groupby(self.labels).nth(0) - - def time_groupby_nth_float64_any(self): - self.data.groupby(self.labels).nth(0, dropna='all') - - def time_groupby_nth_float64_none(self): - self.data.groupby(self.labels).nth(0) - -# with datetimes (GH7555) - -class groupby_first_last_datetimes(object): +class FirstLast(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), }) - - def time_groupby_first_datetimes(self): - self.df.groupby('b').first() + param_names = ['dtype'] + params = ['float32', 'float64', 'datetime', 'object'] - def time_groupby_last_datetimes(self): - self.df.groupby('b').last() + # with datetimes (GH7555) - def time_groupby_nth_datetimes_any(self): - self.df.groupby('b').nth(0, dropna='all') + def setup(self, dtype): - def time_groupby_nth_datetimes_none(self): - self.df.groupby('b').nth(0) - - -class groupby_first_last_object(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000)}) + if dtype == 'datetime': + self.df = DataFrame( + {'values': date_range('1/1/2011', periods=100000, freq='s'), + 'key': range(100000),}) + elif dtype == 'object': + self.df = DataFrame( + {'values': (['foo'] * 100000), + 'key': range(100000)}) + else: + labels = np.arange(10000).repeat(10) + data = Series(randn(len(labels)), dtype=dtype) + data[::3] = np.nan + data[1::3] = np.nan + labels = labels.take(np.random.permutation(len(labels))) + self.df = DataFrame({'values': data, 'key': labels}) - def time_groupby_first_object(self): - self.df.groupby('b').first() + def time_groupby_first(self, dtype): + self.df.groupby('key').first() - def time_groupby_last_object(self): - self.df.groupby('b').last() + def time_groupby_last(self, dtype): + self.df.groupby('key').last() - def time_groupby_nth_object_any(self): - self.df.groupby('b').nth(0, dropna='any') + def time_groupby_nth_any(self, dtype): + self.df.groupby('key').nth(0, dropna='all') - def time_groupby_nth_object_none(self): - self.df.groupby('b').nth(0) + def time_groupby_nth_none(self, dtype): + self.df.groupby('key').nth(0) #---------------------------------------------------------------------- @@ -189,24 +149,6 @@ def time_sum(self): self.df.groupby(self.labels).sum() -#---------------------------------------------------------------------- -# median - -class groupby_frame(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(100000, 2) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) - - def time_groupby_frame_median(self): - self.df.groupby(self.labels).median() - - def time_groupby_simple_compress_timing(self): - self.df.groupby(self.labels).mean() - - #---------------------------------------------------------------------- # DataFrame nth @@ -405,132 +347,118 @@ def time_groupby_dt_timegrouper_size(self): #---------------------------------------------------------------------- # groupby with a variable value for ngroups -class groupby_ngroups_int_10000(object): +class GroupBySuite(object): goal_time = 0.2 - dtype = 'int' - ngroups = 10000 - def setup(self): + param_names = ['dtype', 'ngroups'] + params = [['int', 'float'], [100, 10000]] + + def setup(self, dtype, ngroups): np.random.seed(1234) - size = self.ngroups * 2 - rng = np.arange(self.ngroups) - ts = rng.take(np.random.randint(0, self.ngroups, size=size)) - if self.dtype == 'int': - value = np.random.randint(0, size, size=size) + size = ngroups * 2 + rng = np.arange(ngroups) + values = rng.take(np.random.randint(0, ngroups, size=size)) + if dtype == 'int': + key = np.random.randint(0, size, size=size) else: - value = np.concatenate([np.random.random(self.ngroups) * 0.1, - np.random.random(self.ngroups) * 10.0]) + key = np.concatenate([np.random.random(ngroups) * 0.1, + np.random.random(ngroups) * 10.0]) - self.df = DataFrame({'timestamp': ts, - 'value': value}) + self.df = DataFrame({'values': values, + 'key': key}) - def time_all(self): - self.df.groupby('value')['timestamp'].all() + def time_all(self, dtype, ngroups): + self.df.groupby('key')['values'].all() - def time_any(self): - self.df.groupby('value')['timestamp'].any() + def time_any(self, dtype, ngroups): + self.df.groupby('key')['values'].any() - def time_count(self): - self.df.groupby('value')['timestamp'].count() + def time_count(self, dtype, ngroups): + self.df.groupby('key')['values'].count() - def time_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() + def time_cumcount(self, dtype, ngroups): + self.df.groupby('key')['values'].cumcount() - def time_cummax(self): - self.df.groupby('value')['timestamp'].cummax() + def time_cummax(self, dtype, ngroups): + self.df.groupby('key')['values'].cummax() - def time_cummin(self): - self.df.groupby('value')['timestamp'].cummin() + def time_cummin(self, dtype, ngroups): + self.df.groupby('key')['values'].cummin() - def time_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() + def time_cumprod(self, dtype, ngroups): + self.df.groupby('key')['values'].cumprod() - def time_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() + def time_cumsum(self, dtype, ngroups): + self.df.groupby('key')['values'].cumsum() - def time_describe(self): - self.df.groupby('value')['timestamp'].describe() + def time_describe(self, dtype, ngroups): + self.df.groupby('key')['values'].describe() - def time_diff(self): - self.df.groupby('value')['timestamp'].diff() + def time_diff(self, dtype, ngroups): + self.df.groupby('key')['values'].diff() - def time_first(self): - self.df.groupby('value')['timestamp'].first() + def time_first(self, dtype, ngroups): + self.df.groupby('key')['values'].first() - def time_head(self): - self.df.groupby('value')['timestamp'].head() + def time_head(self, dtype, ngroups): + self.df.groupby('key')['values'].head() - def time_last(self): - self.df.groupby('value')['timestamp'].last() + def time_last(self, dtype, ngroups): + self.df.groupby('key')['values'].last() - def time_mad(self): - self.df.groupby('value')['timestamp'].mad() + def time_mad(self, dtype, ngroups): + self.df.groupby('key')['values'].mad() - def time_max(self): - self.df.groupby('value')['timestamp'].max() + def time_max(self, dtype, ngroups): + self.df.groupby('key')['values'].max() - def time_mean(self): - self.df.groupby('value')['timestamp'].mean() + def time_mean(self, dtype, ngroups): + self.df.groupby('key')['values'].mean() - def time_median(self): - self.df.groupby('value')['timestamp'].median() + def time_median(self, dtype, ngroups): + self.df.groupby('key')['values'].median() - def time_min(self): - self.df.groupby('value')['timestamp'].min() + def time_min(self, dtype, ngroups): + self.df.groupby('key')['values'].min() - def time_nunique(self): - self.df.groupby('value')['timestamp'].nunique() + def time_nunique(self, dtype, ngroups): + self.df.groupby('key')['values'].nunique() - def time_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() + def time_pct_change(self, dtype, ngroups): + self.df.groupby('key')['values'].pct_change() - def time_prod(self): - self.df.groupby('value')['timestamp'].prod() + def time_prod(self, dtype, ngroups): + self.df.groupby('key')['values'].prod() - def time_rank(self): - self.df.groupby('value')['timestamp'].rank() + def time_rank(self, dtype, ngroups): + self.df.groupby('key')['values'].rank() - def time_sem(self): - self.df.groupby('value')['timestamp'].sem() + def time_sem(self, dtype, ngroups): + self.df.groupby('key')['values'].sem() - def time_size(self): - self.df.groupby('value')['timestamp'].size() + def time_size(self, dtype, ngroups): + self.df.groupby('key')['values'].size() - def time_skew(self): - self.df.groupby('value')['timestamp'].skew() + def time_skew(self, dtype, ngroups): + self.df.groupby('key')['values'].skew() - def time_std(self): - self.df.groupby('value')['timestamp'].std() - - def time_sum(self): - self.df.groupby('value')['timestamp'].sum() + def time_std(self, dtype, ngroups): + self.df.groupby('key')['values'].std() - def time_tail(self): - self.df.groupby('value')['timestamp'].tail() + def time_sum(self, dtype, ngroups): + self.df.groupby('key')['values'].sum() - def time_unique(self): - self.df.groupby('value')['timestamp'].unique() + def time_tail(self, dtype, ngroups): + self.df.groupby('key')['values'].tail() - def time_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() + def time_unique(self, dtype, ngroups): + self.df.groupby('key')['values'].unique() - def time_var(self): - self.df.groupby('value')['timestamp'].var() - -class groupby_ngroups_int_100(groupby_ngroups_int_10000): - goal_time = 0.2 - dtype = 'int' - ngroups = 100 - -class groupby_ngroups_float_100(groupby_ngroups_int_10000): - goal_time = 0.2 - dtype = 'float' - ngroups = 100 + def time_value_counts(self, dtype, ngroups): + self.df.groupby('key')['values'].value_counts() -class groupby_ngroups_float_10000(groupby_ngroups_int_10000): - goal_time = 0.2 - dtype = 'float' - ngroups = 10000 + def time_var(self, dtype, ngroups): + self.df.groupby('key')['values'].var() class groupby_float32(object): @@ -647,89 +575,75 @@ def time_groupby_sum_multiindex(self): #------------------------------------------------------------------------------- # Transform testing -class groupby_transform(object): +class Transform(object): goal_time = 0.2 def setup(self): - self.n_dates = 400 - self.n_securities = 250 - self.n_columns = 3 - self.share_na = 0.1 - self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B') - self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates)) - self.secid_min = int('10000000', 16) - self.secid_max = int('F0000000', 16) - self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) - self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step))) - self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], - labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)], - names=['date', 'security_id']) - self.n_data = len(self.data_index) - self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))]) - self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) - self.step = int((self.n_data * self.share_na)) - for column_index in range(self.n_columns): - self.index = column_index - while (self.index < self.n_data): - self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan) - self.index += self.step - self.f_fillna = (lambda x: x.fillna(method='pad')) - - def time_groupby_transform(self): - self.data.groupby(level='security_id').transform(self.f_fillna) + n1 = 400 + n2 = 250 - def time_groupby_transform_ufunc(self): - self.data.groupby(level='date').transform(np.max) + index = MultiIndex( + levels=[np.arange(n1), pd.util.testing.makeStringIndex(n2)], + labels=[[i for i in range(n1) for _ in range(n2)], + (list(range(n2)) * n1)], + names=['lev1', 'lev2']) + data = DataFrame(np.random.randn(n1 * n2, 3), + index=index, columns=['col1', 'col20', 'col3']) + step = int((n1 * n2 * 0.1)) + for col in range(len(data.columns)): + idx = col + while (idx < len(data)): + data.set_value(data.index[idx], data.columns[col], np.nan) + idx += step + self.df = data + self.f_fillna = (lambda x: x.fillna(method='pad')) -class groupby_transform_multi_key(object): - goal_time = 0.2 - - def setup(self): np.random.seed(2718281) - self.n = 20000 - self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) + n = 20000 + self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), + columns=['jim', 'joe', 'jolie']) + self.df2 = self.df1.copy() + self.df2['jim'] = self.df2['joe'] - def time_groupby_transform_multi_key1(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)), + columns=['jim', 'joe', 'jolie']) + self.df4 = self.df3.copy() + self.df4['jim'] = self.df4['joe'] + def time_transform_func(self): + self.df.groupby(level='lev2').transform(self.f_fillna) -class groupby_transform_multi_key2(object): - goal_time = 0.2 + def time_transform_ufunc(self): + self.df.groupby(level='lev1').transform(np.max) - def setup(self): - np.random.seed(2718281) - self.n = 20000 - self.df = DataFrame(np.random.randint(1, self.n, (self.n, 3)), columns=['jim', 'joe', 'jolie']) - self.df['jim'] = self.df['joe'] + def time_transform_multi_key1(self): + self.df1.groupby(['jim', 'joe'])['jolie'].transform('max') - def time_groupby_transform_multi_key2(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') + def time_transform_multi_key2(self): + self.df2.groupby(['jim', 'joe'])['jolie'].transform('max') + def time_transform_multi_key3(self): + self.df3.groupby(['jim', 'joe'])['jolie'].transform('max') -class groupby_transform_multi_key3(object): - goal_time = 0.2 + def time_transform_multi_key4(self): + self.df4.groupby(['jim', 'joe'])['jolie'].transform('max') - def setup(self): - np.random.seed(2718281) - self.n = 200000 - self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) - def time_groupby_transform_multi_key3(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') -class groupby_transform_multi_key4(object): - goal_time = 0.2 +np.random.seed(0) +N = 120000 +N_TRANSITIONS = 1400 +transition_points = np.random.permutation(np.arange(N))[:N_TRANSITIONS] +transition_points.sort() +transitions = np.zeros((N,), dtype=np.bool) +transitions[transition_points] = True +g = transitions.cumsum() +df = DataFrame({'signal': np.random.rand(N), }) + - def setup(self): - np.random.seed(2718281) - self.n = 200000 - self.df = DataFrame(np.random.randint(1, (self.n / 10), (self.n, 3)), columns=['jim', 'joe', 'jolie']) - self.df['jim'] = self.df['joe'] - def time_groupby_transform_multi_key4(self): - self.df.groupby(['jim', 'joe'])['jolie'].transform('max') class groupby_transform_series(object): @@ -737,14 +651,12 @@ class groupby_transform_series(object): def setup(self): np.random.seed(0) - self.N = 120000 - self.N_TRANSITIONS = 1400 - self.transition_points = np.random.permutation(np.arange(self.N))[:self.N_TRANSITIONS] - self.transition_points.sort() - self.transitions = np.zeros((self.N,), dtype=np.bool) - self.transitions[self.transition_points] = True - self.g = self.transitions.cumsum() - self.df = DataFrame({'signal': np.random.rand(self.N), }) + N = 120000 + transition_points = np.sort(np.random.choice(np.arange(N), 1400)) + transitions = np.zeros((N,), dtype=np.bool) + transitions[transition_points] = True + self.g = transitions.cumsum() + self.df = DataFrame({'signal': np.random.rand(N)}) def time_groupby_transform_series(self): self.df['signal'].groupby(self.g).transform(np.mean) @@ -755,38 +667,29 @@ class groupby_transform_series2(object): def setup(self): np.random.seed(0) - self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), }) + self.df = DataFrame({'key': (np.arange(100000) // 3), + 'val': np.random.randn(100000)}) - def time_groupby_transform_series2(self): - self.df.groupby('id')['val'].transform(np.mean) + self.df_nans = pd.DataFrame({'key': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df_nans.ix[4::10, 'B':'C'] = 5 + def time_transform_series2(self): + self.df.groupby('key')['val'].transform(np.mean) -class groupby_transform_dataframe(object): - # GH 12737 - goal_time = 0.2 - - def setup(self): - self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10), - 'B': np.nan, - 'C': np.nan}) - self.df.ix[4::10, 'B':'C'] = 5 - - def time_groupby_transform_dataframe(self): - self.df.groupby('group').transform('first') + def time_cumprod(self): + self.df.groupby('key').cumprod() + def time_cumsum(self): + self.df.groupby('key').cumsum() -class groupby_transform_cythonized(object): - goal_time = 0.2 + def time_shift(self): + self.df.groupby('key').shift() - def setup(self): - np.random.seed(0) - self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), }) + def time_transform_dataframe(self): + # GH 12737 + self.df_nans.groupby('key').transform('first') - def time_groupby_transform_cumprod(self): - self.df.groupby('id').cumprod() - def time_groupby_transform_cumsum(self): - self.df.groupby('id').cumsum() - def time_groupby_transform_shift(self): - self.df.groupby('id').shift() From 0748cb01d390cde6434182ebe6eb1625e0948feb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 10 Dec 2016 15:10:29 +0100 Subject: [PATCH 20/20] Clean-up HDFStore benchmarks --- asv_bench/benchmarks/hdfstore_bench.py | 331 ++++--------------------- 1 file changed, 51 insertions(+), 280 deletions(-) diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index 659fc4941da54..78de5267a2969 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -2,186 +2,45 @@ import os -class query_store_table(object): +class HDF5(object): goal_time = 0.2 def setup(self): - self.f = '__test__.h5' - self.index = date_range('1/1/2000', periods=25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df12', self.df) - - def time_query_store_table(self): - self.store.select('df12', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])]) + self.index = tm.makeStringIndex(25000) + self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000),}, + index=self.index) - def teardown(self): - self.store.close() + self.df_mixed = DataFrame( + {'float1': randn(25000), 'float2': randn(25000), + 'string1': (['foo'] * 25000), + 'bool1': ([True] * 25000), + 'int1': np.random.randint(0, 250000, size=25000),}, + index=self.index) - def remove(self, f): - try: - os.remove(self.f) - except: - pass + self.df_wide = DataFrame(np.random.randn(25000, 100)) + self.df2 = DataFrame({'float1': randn(25000), 'float2': randn(25000)}, + index=date_range('1/1/2000', periods=25000)) + self.df_wide2 = DataFrame(np.random.randn(25000, 100), + index=date_range('1/1/2000', periods=25000)) -class query_store_table_wide(object): - goal_time = 0.2 + self.df_dc = DataFrame(np.random.randn(10000, 10), + columns=[('C%03d' % i) for i in range(10)]) - def setup(self): self.f = '__test__.h5' - self.index = date_range('1/1/2000', periods=25000) - self.df = DataFrame(np.random.randn(25000, 100), index=self.index) self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df11', self.df) - def time_query_store_table_wide(self): - self.store.select('df11', [('index', '>', self.df.index[10000]), ('index', '<', self.df.index[15000])]) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) self.store = HDFStore(self.f) self.store.put('df1', self.df) + self.store.put('df_mixed', self.df_mixed) - def time_read_store(self): - self.store.get('df1') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.put('df3', self.df) - - def time_read_store_mixed(self): - self.store.get('df3') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) + self.store.append('df5', self.df_mixed) self.store.append('df7', self.df) - def time_read_store_table(self): - self.store.select('df7') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.N = 10000 - self.index = tm.makeStringIndex(self.N) - self.df = DataFrame({'float1': randn(self.N), 'float2': randn(self.N), 'string1': (['foo'] * self.N), 'bool1': ([True] * self.N), 'int1': np.random.randint(0, self.N, size=self.N), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df5', self.df) - - def time_read_store_table_mixed(self): - self.store.select('df5') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_panel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('p1', self.p) - - def time_read_store_table_panel(self): - self.store.select('p1') - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class read_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(25000, 100)) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('df9', self.df) + self.store.append('df9', self.df_wide) - def time_read_store_table_wide(self): - self.store.select('df9') + self.store.append('df11', self.df_wide2) + self.store.append('df12', self.df2) def teardown(self): self.store.close() @@ -192,110 +51,60 @@ def remove(self, f): except: pass + def time_read_store(self): + self.store.get('df1') -class write_store(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_read_store_mixed(self): + self.store.get('df_mixed') def time_write_store(self): self.store.put('df2', self.df) - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_mixed(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 250000, size=25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) - def time_write_store_mixed(self): - self.store.put('df4', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass + self.store.put('df_mixed2', self.df_mixed) + def time_read_store_table_mixed(self): + self.store.select('df5') -class write_store_table(object): - goal_time = 0.2 + def time_write_store_table_mixed(self): + self.store.append('df6', self.df_mixed) - def setup(self): - self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), }, index=self.index) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_read_store_table(self): + self.store.select('df7') def time_write_store_table(self): self.store.append('df8', self.df) - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_dc(object): - goal_time = 0.2 + def time_read_store_table_wide(self): + self.store.select('df9') - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(10000, 10), columns=[('C%03d' % i) for i in range(10)]) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_write_store_table_wide(self): + self.store.append('df10', self.df_wide) def time_write_store_table_dc(self): self.store.append('df15', self.df, data_columns=True) - def teardown(self): - self.store.close() + def time_query_store_table_wide(self): + self.store.select('df11', [('index', '>', self.df_wide2.index[10000]), + ('index', '<', self.df_wide2.index[15000])]) - def remove(self, f): - try: - os.remove(self.f) - except: - pass + def time_query_store_table(self): + self.store.select('df12', [('index', '>', self.df2.index[10000]), + ('index', '<', self.df2.index[15000])]) -class write_store_table_mixed(object): +class HDF5Panel(object): goal_time = 0.2 def setup(self): self.f = '__test__.h5' - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000), 'string1': (['foo'] * 25000), 'bool1': ([True] * 25000), 'int1': np.random.randint(0, 25000, size=25000), }, index=self.index) + self.p = Panel(randn(20, 1000, 25), + items=[('Item%03d' % i) for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=[('E%03d' % i) for i in range(25)]) self.remove(self.f) self.store = HDFStore(self.f) - - def time_write_store_table_mixed(self): - self.store.append('df6', self.df) + self.store.append('p1', self.p) def teardown(self): self.store.close() @@ -306,46 +115,8 @@ def remove(self, f): except: pass - -class write_store_table_panel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), items=[('Item%03d' % i) for i in range(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) + def time_read_store_table_panel(self): + self.store.select('p1') def time_write_store_table_panel(self): self.store.append('p2', self.p) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - -class write_store_table_wide(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.df = DataFrame(np.random.randn(25000, 100)) - self.remove(self.f) - self.store = HDFStore(self.f) - - def time_write_store_table_wide(self): - self.store.append('df10', self.df) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass