diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index f1ac09b8b2516..138977a29463e 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -3,7 +3,7 @@
 from itertools import product
 
 
-class groupby_agg_builtins1(object):
+class groupby_agg_builtins(object):
     goal_time = 0.2
 
     def setup(self):
@@ -14,18 +14,11 @@ def setup(self):
     def time_groupby_agg_builtins1(self):
         self.df.groupby('jim').agg([sum, min, max])
 
-
-class groupby_agg_builtins2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        np.random.seed(27182)
-        self.n = 100000
-        self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie'])
-
     def time_groupby_agg_builtins2(self):
         self.df.groupby(['jim', 'joe']).agg([sum, min, max])
 
+#----------------------------------------------------------------------
+# dict return values
 
 class groupby_apply_dict_return(object):
     goal_time = 0.2
@@ -39,33 +32,49 @@ def time_groupby_apply_dict_return(self):
         self.data.groupby(self.labels).apply(self.f)
 
 
-class groupby_dt_size(object):
+#----------------------------------------------------------------------
+# First / last functions
+
+class groupby_first_last(object):
     goal_time = 0.2
 
     def setup(self):
-        self.n = 100000
-        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
-        self.dates = (np.datetime64('now') + self.offsets)
-        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+        self.labels = np.arange(10000).repeat(10)
+        self.data = Series(randn(len(self.labels)))
+        self.data[::3] = np.nan
+        self.data[1::3] = np.nan
+        self.data2 = Series(randn(len(self.labels)), dtype='float32')
+        self.data2[::3] = np.nan
+        self.data2[1::3] = np.nan
+        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
 
-    def time_groupby_dt_size(self):
-        self.df.groupby(['dates']).size()
+    def time_groupby_first_float32(self):
+        self.data2.groupby(self.labels).first()
 
+    def time_groupby_first_float64(self):
+        self.data.groupby(self.labels).first()
 
-class groupby_dt_timegrouper_size(object):
-    goal_time = 0.2
+    def time_groupby_last_float32(self):
+        self.data2.groupby(self.labels).last()
 
-    def setup(self):
-        self.n = 100000
-        self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]')
-        self.dates = (np.datetime64('now') + self.offsets)
-        self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })
+    def time_groupby_last_float64(self):
+        self.data.groupby(self.labels).last()
 
-    def time_groupby_dt_timegrouper_size(self):
-        self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
+    def time_groupby_nth_float32_any(self):
+        self.data2.groupby(self.labels).nth(0, dropna='all')
+
+    def time_groupby_nth_float32_none(self):
+        self.data2.groupby(self.labels).nth(0)
+
+    def time_groupby_nth_float64_any(self):
+        self.data.groupby(self.labels).nth(0, dropna='all')
+
+    def time_groupby_nth_float64_none(self):
+        self.data.groupby(self.labels).nth(0)
 
+# with datetimes (GH7555)
 
-class groupby_first_datetimes(object):
+class groupby_first_last_datetimes(object):
     goal_time = 0.2
 
     def setup(self):
@@ -74,50 +83,37 @@ def setup(self):
     def
time_groupby_first_datetimes(self): self.df.groupby('b').first() + def time_groupby_last_datetimes(self): + self.df.groupby('b').last() -class groupby_first_float32(object): - goal_time = 0.2 - - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) + def time_groupby_nth_datetimes_any(self): + self.df.groupby('b').nth(0, dropna='all') - def time_groupby_first_float32(self): - self.data2.groupby(self.labels).first() + def time_groupby_nth_datetimes_none(self): + self.df.groupby('b').nth(0) -class groupby_first_float64(object): +class groupby_first_last_object(object): goal_time = 0.2 def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) + self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000)}) - def time_groupby_first_float64(self): - self.data.groupby(self.labels).first() + def time_groupby_first_object(self): + self.df.groupby('b').first() + def time_groupby_last_object(self): + self.df.groupby('b').last() -class groupby_first_object(object): - goal_time = 0.2 + def time_groupby_nth_object_any(self): + self.df.groupby('b').nth(0, dropna='any') - def setup(self): - self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), }) + def time_groupby_nth_object_none(self): + self.df.groupby('b').nth(0) - def time_groupby_first_object(self): - self.df.groupby('b').first() +#---------------------------------------------------------------------- +# DataFrame Apply overhead class groupby_frame_apply(object): goal_time = 0.2 @@ -128,28 +124,18 @@ def setup(self): self.labels2 = np.random.randint(0, 3, size=self.N) self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), }) - def time_groupby_frame_apply(self): - self.df.groupby(['key', 'key2']).apply(self.f) - def f(self, g): return 1 - -class groupby_frame_apply_overhead(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.labels = np.random.randint(0, 2000, size=self.N) - self.labels2 = np.random.randint(0, 3, size=self.N) - self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), }) + def time_groupby_frame_apply(self): + self.df.groupby(['key', 'key2']).apply(self.f) def time_groupby_frame_apply_overhead(self): self.df.groupby('key').apply(self.f) - def f(self, g): - return 1 +#---------------------------------------------------------------------- +# 2d grouping, aggregate many columns class groupby_frame_cython_many_columns(object): goal_time = 0.2 @@ -158,53 +144,67 @@ def setup(self): self.labels = np.random.randint(0, 100, size=1000) self.df = DataFrame(randn(1000, 1000)) - def time_groupby_frame_cython_many_columns(self): + def time_sum(self): self.df.groupby(self.labels).sum() -class groupby_frame_median(object): +#---------------------------------------------------------------------- +# single key, long, integer key + +class 
groupby_frame_singlekey_integer(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(100000, 2) + self.data = np.random.randn(100000, 1) self.labels = np.random.randint(0, 1000, size=100000) self.df = DataFrame(self.data) - def time_groupby_frame_median(self): - self.df.groupby(self.labels).median() + def time_sum(self): + self.df.groupby(self.labels).sum() + +#---------------------------------------------------------------------- +# median -class groupby_frame_nth_any(object): +class groupby_frame(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) + self.data = np.random.randn(100000, 2) + self.labels = np.random.randint(0, 1000, size=100000) + self.df = DataFrame(self.data) + + def time_groupby_frame_median(self): + self.df.groupby(self.labels).median() + + def time_groupby_simple_compress_timing(self): + self.df.groupby(self.labels).mean() - def time_groupby_frame_nth_any(self): - self.df.groupby(0).nth(0, dropna='any') +#---------------------------------------------------------------------- +# DataFrame nth -class groupby_frame_nth_none(object): +class groupby_nth(object): goal_time = 0.2 def setup(self): self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) + def time_groupby_frame_nth_any(self): + self.df.groupby(0).nth(0, dropna='any') + def time_groupby_frame_nth_none(self): self.df.groupby(0).nth(0) + def time_groupby_series_nth_any(self): + self.df[1].groupby(self.df[0]).nth(0, dropna='any') -class groupby_frame_singlekey_integer(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(100000, 1) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) + def time_groupby_series_nth_none(self): + self.df[1].groupby(self.df[0]).nth(0) - def time_groupby_frame_singlekey_integer(self): - self.df.groupby(self.labels).sum() +#---------------------------------------------------------------------- +# groupby_indices replacement, chop up Series class groupby_indices(object): goal_time = 0.2 @@ -240,70 +240,8 @@ def time_groupby_int64_overflow(self): self.df.groupby(list('abcde')).max() -class groupby_int_count(object): - goal_time = 0.2 - - def setup(self): - self.n = 10000 - self.df = DataFrame({'key1': randint(0, 500, size=self.n), 'key2': randint(0, 100, size=self.n), 'ints': randint(0, 1000, size=self.n), 'ints2': randint(0, 1000, size=self.n), }) - - def time_groupby_int_count(self): - self.df.groupby(['key1', 'key2']).count() - - -class groupby_last_datetimes(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), }) - - def time_groupby_last_datetimes(self): - self.df.groupby('b').last() - - -class groupby_last_float32(object): - goal_time = 0.2 - - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) - - def time_groupby_last_float32(self): - self.data2.groupby(self.labels).last() - - -class groupby_last_float64(object): - goal_time = 0.2 - - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - 
self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) - - def time_groupby_last_float64(self): - self.data.groupby(self.labels).last() - - -class groupby_last_object(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), }) - - def time_groupby_last_object(self): - self.df.groupby('b').last() - +#---------------------------------------------------------------------- +# count() speed class groupby_multi_count(object): goal_time = 0.2 @@ -318,38 +256,37 @@ def setup(self): self.value2[(np.random.rand(self.n) > 0.5)] = np.nan self.obj = tm.choice(list('ab'), size=self.n).astype(object) self.obj[(np.random.randn(self.n) > 0.5)] = np.nan - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'dates': self.dates, 'value2': self.value2, 'value3': np.random.randn(self.n), 'ints': np.random.randint(0, 1000, size=self.n), 'obj': self.obj, 'offsets': self.offsets, }) + self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), + 'key2': np.random.randint(0, 100, size=self.n), + 'dates': self.dates, + 'value2': self.value2, + 'value3': np.random.randn(self.n), + 'ints': np.random.randint(0, 1000, size=self.n), + 'obj': self.obj, + 'offsets': self.offsets, }) def time_groupby_multi_count(self): self.df.groupby(['key1', 'key2']).count() -class groupby_multi_cython(object): +class groupby_int_count(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.ngroups = 100 - self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) - self.simple_series = Series(np.random.randn(self.N)) - self.key1 = self.df['key1'] - - def time_groupby_multi_cython(self): - self.df.groupby(['key1', 'key2']).sum() + self.n = 10000 + self.df = DataFrame({'key1': randint(0, 500, size=self.n), + 'key2': randint(0, 100, size=self.n), + 'ints': randint(0, 1000, size=self.n), + 'ints2': randint(0, 1000, size=self.n), }) - def get_test_data(self, ngroups=100, n=100000): - self.unique_groups = range(self.ngroups) - self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object) - if (len(self.arr) < n): - self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) - random.shuffle(self.arr) - return self.arr + def time_groupby_int_count(self): + self.df.groupby(['key1', 'key2']).count() - def f(self): - self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) +#---------------------------------------------------------------------- +# group with different functions per column -class groupby_multi_different_functions(object): +class groupby_agg_multi(object): goal_time = 0.2 def setup(self): @@ -358,19 +295,10 @@ def setup(self): self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) def time_groupby_multi_different_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum', }) - - -class groupby_multi_different_numpy_functions(object): - goal_time = 0.2 - - def setup(self): - self.fac1 = np.array(['A', 'B', 'C'], dtype='O') - self.fac2 = np.array(['one', 'two'], dtype='O') - 
self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) + self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum'}) def time_groupby_multi_different_numpy_functions(self): - self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum, }) + self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum}) class groupby_multi_index(object): @@ -389,7 +317,7 @@ def time_groupby_multi_index(self): self.df.groupby(list('abcd')).max() -class groupby_multi_python(object): +class groupby_multi(object): goal_time = 0.2 def setup(self): @@ -399,9 +327,6 @@ def setup(self): self.simple_series = Series(np.random.randn(self.N)) self.key1 = self.df['key1'] - def time_groupby_multi_python(self): - self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum())) - def get_test_data(self, ngroups=100, n=100000): self.unique_groups = range(self.ngroups) self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object) @@ -413,33 +338,26 @@ def get_test_data(self, ngroups=100, n=100000): def f(self): self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) + def time_groupby_multi_cython(self): + self.df.groupby(['key1', 'key2']).sum() -class groupby_multi_series_op(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.ngroups = 100 - self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) - self.simple_series = Series(np.random.randn(self.N)) - self.key1 = self.df['key1'] + def time_groupby_multi_python(self): + self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum())) def time_groupby_multi_series_op(self): self.df.groupby(['key1', 'key2'])['data1'].agg(np.std) - def get_test_data(self, ngroups=100, n=100000): - self.unique_groups = range(self.ngroups) - self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object) - if (len(self.arr) < n): - self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) - random.shuffle(self.arr) - return self.arr + def time_groupby_series_simple_cython(self): + self.simple_series.groupby(self.key1).sum() - def f(self): - self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) + def time_groupby_series_simple_rank(self): + self.df.groupby('key1').rank(pct=True) -class groupby_multi_size(object): +#---------------------------------------------------------------------- +# size() speed + +class groupby_size(object): goal_time = 0.2 def setup(self): @@ -451,22 +369,17 @@ def setup(self): def time_groupby_multi_size(self): self.df.groupby(['key1', 'key2']).size() + def time_groupby_dt_size(self): + self.df.groupby(['dates']).size() -class groupby_ngroups_10000_all(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + def time_groupby_dt_timegrouper_size(self): + self.df.groupby(TimeGrouper(key='dates', freq='M')).size() - def 
time_groupby_ngroups_10000_all(self): - self.df.groupby('value')['timestamp'].all() +#---------------------------------------------------------------------- +# groupby with a variable value for ngroups -class groupby_ngroups_10000_any(object): +class groupby_ngroups_10000(object): goal_time = 0.2 def setup(self): @@ -476,809 +389,101 @@ def setup(self): self.rng = np.arange(self.ngroups) self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - def time_groupby_ngroups_10000_any(self): - self.df.groupby('value')['timestamp'].any() - - -class groupby_ngroups_10000_count(object): - goal_time = 0.2 + def time_all(self): + self.df.groupby('value')['timestamp'].all() - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + def time_any(self): + self.df.groupby('value')['timestamp'].any() - def time_groupby_ngroups_10000_count(self): + def time_count(self): self.df.groupby('value')['timestamp'].count() - -class groupby_ngroups_10000_cumcount(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_cumcount(self): + def time_cumcount(self): self.df.groupby('value')['timestamp'].cumcount() - -class groupby_ngroups_10000_cummax(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_cummax(self): + def time_cummax(self): self.df.groupby('value')['timestamp'].cummax() - -class groupby_ngroups_10000_cummin(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_cummin(self): + def time_cummin(self): self.df.groupby('value')['timestamp'].cummin() - -class groupby_ngroups_10000_cumprod(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_cumprod(self): + def time_cumprod(self): self.df.groupby('value')['timestamp'].cumprod() - -class groupby_ngroups_10000_cumsum(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, 
size=self.size))) - - def time_groupby_ngroups_10000_cumsum(self): + def time_cumsum(self): self.df.groupby('value')['timestamp'].cumsum() - -class groupby_ngroups_10000_describe(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_describe(self): + def time_describe(self): self.df.groupby('value')['timestamp'].describe() - -class groupby_ngroups_10000_diff(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_diff(self): + def time_diff(self): self.df.groupby('value')['timestamp'].diff() - -class groupby_ngroups_10000_first(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_first(self): + def time_first(self): self.df.groupby('value')['timestamp'].first() - -class groupby_ngroups_10000_head(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_head(self): + def time_head(self): self.df.groupby('value')['timestamp'].head() - -class groupby_ngroups_10000_last(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_last(self): + def time_last(self): self.df.groupby('value')['timestamp'].last() - -class groupby_ngroups_10000_mad(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_mad(self): + def time_mad(self): self.df.groupby('value')['timestamp'].mad() - -class groupby_ngroups_10000_max(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_max(self): + def time_max(self): self.df.groupby('value')['timestamp'].max() - -class groupby_ngroups_10000_mean(object): - goal_time = 0.2 - - def 
setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_mean(self): + def time_mean(self): self.df.groupby('value')['timestamp'].mean() - -class groupby_ngroups_10000_median(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_median(self): + def time_median(self): self.df.groupby('value')['timestamp'].median() - -class groupby_ngroups_10000_min(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_min(self): + def time_min(self): self.df.groupby('value')['timestamp'].min() - -class groupby_ngroups_10000_nunique(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_nunique(self): + def time_nunique(self): self.df.groupby('value')['timestamp'].nunique() + def time_pct_change(self): + self.df.groupby('value')['timestamp'].pct_change() -class groupby_ngroups_10000_pct_change(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() - - -class groupby_ngroups_10000_prod(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_prod(self): - self.df.groupby('value')['timestamp'].prod() - - -class groupby_ngroups_10000_rank(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_rank(self): - self.df.groupby('value')['timestamp'].rank() - - -class groupby_ngroups_10000_sem(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = 
DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_sem(self): - self.df.groupby('value')['timestamp'].sem() - - -class groupby_ngroups_10000_size(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_size(self): - self.df.groupby('value')['timestamp'].size() - - -class groupby_ngroups_10000_skew(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_skew(self): - self.df.groupby('value')['timestamp'].skew() - - -class groupby_ngroups_10000_std(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_std(self): - self.df.groupby('value')['timestamp'].std() - - -class groupby_ngroups_10000_sum(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_sum(self): - self.df.groupby('value')['timestamp'].sum() - - -class groupby_ngroups_10000_tail(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_tail(self): - self.df.groupby('value')['timestamp'].tail() - - -class groupby_ngroups_10000_unique(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_unique(self): - self.df.groupby('value')['timestamp'].unique() - - -class groupby_ngroups_10000_value_counts(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() - - -class groupby_ngroups_10000_var(object): - goal_time = 0.2 - - def setup(self): - 
np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_10000_var(self): - self.df.groupby('value')['timestamp'].var() - - -class groupby_ngroups_100_all(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_all(self): - self.df.groupby('value')['timestamp'].all() - - -class groupby_ngroups_100_any(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_any(self): - self.df.groupby('value')['timestamp'].any() - - -class groupby_ngroups_100_count(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_count(self): - self.df.groupby('value')['timestamp'].count() - - -class groupby_ngroups_100_cumcount(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() - - -class groupby_ngroups_100_cummax(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_cummax(self): - self.df.groupby('value')['timestamp'].cummax() - - -class groupby_ngroups_100_cummin(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_cummin(self): - self.df.groupby('value')['timestamp'].cummin() - - -class groupby_ngroups_100_cumprod(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() - - 
-class groupby_ngroups_100_cumsum(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() - - -class groupby_ngroups_100_describe(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_describe(self): - self.df.groupby('value')['timestamp'].describe() - - -class groupby_ngroups_100_diff(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_diff(self): - self.df.groupby('value')['timestamp'].diff() - - -class groupby_ngroups_100_first(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_first(self): - self.df.groupby('value')['timestamp'].first() - - -class groupby_ngroups_100_head(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_head(self): - self.df.groupby('value')['timestamp'].head() - - -class groupby_ngroups_100_last(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_last(self): - self.df.groupby('value')['timestamp'].last() - - -class groupby_ngroups_100_mad(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_mad(self): - self.df.groupby('value')['timestamp'].mad() - - -class groupby_ngroups_100_max(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def 
time_groupby_ngroups_100_max(self): - self.df.groupby('value')['timestamp'].max() - - -class groupby_ngroups_100_mean(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_mean(self): - self.df.groupby('value')['timestamp'].mean() - - -class groupby_ngroups_100_median(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_median(self): - self.df.groupby('value')['timestamp'].median() - - -class groupby_ngroups_100_min(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_min(self): - self.df.groupby('value')['timestamp'].min() - - -class groupby_ngroups_100_nunique(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_nunique(self): - self.df.groupby('value')['timestamp'].nunique() - - -class groupby_ngroups_100_pct_change(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() - - -class groupby_ngroups_100_prod(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_prod(self): - self.df.groupby('value')['timestamp'].prod() - - -class groupby_ngroups_100_rank(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_rank(self): - self.df.groupby('value')['timestamp'].rank() - - -class groupby_ngroups_100_sem(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, 
size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_sem(self): - self.df.groupby('value')['timestamp'].sem() - - -class groupby_ngroups_100_size(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_size(self): - self.df.groupby('value')['timestamp'].size() - - -class groupby_ngroups_100_skew(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_skew(self): - self.df.groupby('value')['timestamp'].skew() - - -class groupby_ngroups_100_std(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_groupby_ngroups_100_std(self): - self.df.groupby('value')['timestamp'].std() + def time_prod(self): + self.df.groupby('value')['timestamp'].prod() + def time_rank(self): + self.df.groupby('value')['timestamp'].rank() -class groupby_ngroups_100_sum(object): - goal_time = 0.2 + def time_sem(self): + self.df.groupby('value')['timestamp'].sem() - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + def time_size(self): + self.df.groupby('value')['timestamp'].size() + + def time_skew(self): + self.df.groupby('value')['timestamp'].skew() + + def time_std(self): + self.df.groupby('value')['timestamp'].std() - def time_groupby_ngroups_100_sum(self): + def time_sum(self): self.df.groupby('value')['timestamp'].sum() + def time_tail(self): + self.df.groupby('value')['timestamp'].tail() -class groupby_ngroups_100_tail(object): - goal_time = 0.2 + def time_unique(self): + self.df.groupby('value')['timestamp'].unique() - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + def time_value_counts(self): + self.df.groupby('value')['timestamp'].value_counts() - def time_groupby_ngroups_100_tail(self): - self.df.groupby('value')['timestamp'].tail() + def time_var(self): + self.df.groupby('value')['timestamp'].var() -class groupby_ngroups_100_unique(object): +class groupby_ngroups_100(object): goal_time = 0.2 def setup(self): @@ -1288,145 +493,127 @@ def setup(self): self.rng = np.arange(self.ngroups) self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - def time_groupby_ngroups_100_unique(self): - 
self.df.groupby('value')['timestamp'].unique() - + def time_all(self): + self.df.groupby('value')['timestamp'].all() -class groupby_ngroups_100_value_counts(object): - goal_time = 0.2 + def time_any(self): + self.df.groupby('value')['timestamp'].any() - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + def time_count(self): + self.df.groupby('value')['timestamp'].count() - def time_groupby_ngroups_100_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() + def time_cumcount(self): + self.df.groupby('value')['timestamp'].cumcount() + def time_cummax(self): + self.df.groupby('value')['timestamp'].cummax() -class groupby_ngroups_100_var(object): - goal_time = 0.2 + def time_cummin(self): + self.df.groupby('value')['timestamp'].cummin() - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + def time_cumprod(self): + self.df.groupby('value')['timestamp'].cumprod() - def time_groupby_ngroups_100_var(self): - self.df.groupby('value')['timestamp'].var() + def time_cumsum(self): + self.df.groupby('value')['timestamp'].cumsum() + def time_describe(self): + self.df.groupby('value')['timestamp'].describe() -class groupby_nth_datetimes_any(object): - goal_time = 0.2 + def time_diff(self): + self.df.groupby('value')['timestamp'].diff() - def setup(self): - self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), }) + def time_first(self): + self.df.groupby('value')['timestamp'].first() - def time_groupby_nth_datetimes_any(self): - self.df.groupby('b').nth(0, dropna='all') + def time_head(self): + self.df.groupby('value')['timestamp'].head() + def time_last(self): + self.df.groupby('value')['timestamp'].last() -class groupby_nth_datetimes_none(object): - goal_time = 0.2 + def time_mad(self): + self.df.groupby('value')['timestamp'].mad() - def setup(self): - self.df = DataFrame({'a': date_range('1/1/2011', periods=100000, freq='s'), 'b': range(100000), }) + def time_max(self): + self.df.groupby('value')['timestamp'].max() - def time_groupby_nth_datetimes_none(self): - self.df.groupby('b').nth(0) + def time_mean(self): + self.df.groupby('value')['timestamp'].mean() + def time_median(self): + self.df.groupby('value')['timestamp'].median() -class groupby_nth_float32_any(object): - goal_time = 0.2 + def time_min(self): + self.df.groupby('value')['timestamp'].min() - def setup(self): - self.labels = np.arange(10000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.data[::3] = np.nan - self.data[1::3] = np.nan - self.data2 = Series(randn(len(self.labels)), dtype='float32') - self.data2[::3] = np.nan - self.data2[1::3] = np.nan - self.labels = self.labels.take(np.random.permutation(len(self.labels))) + def time_nunique(self): + self.df.groupby('value')['timestamp'].nunique() - def time_groupby_nth_float32_any(self): - self.data2.groupby(self.labels).nth(0, dropna='all') + def time_pct_change(self): + self.df.groupby('value')['timestamp'].pct_change() + def time_prod(self): + self.df.groupby('value')['timestamp'].prod() -class groupby_nth_float32_none(object): - 
    goal_time = 0.2
 
+    def time_rank(self):
+        self.df.groupby('value')['timestamp'].rank()
 
-    def setup(self):
-        self.labels = np.arange(10000).repeat(10)
-        self.data = Series(randn(len(self.labels)))
-        self.data[::3] = np.nan
-        self.data[1::3] = np.nan
-        self.data2 = Series(randn(len(self.labels)), dtype='float32')
-        self.data2[::3] = np.nan
-        self.data2[1::3] = np.nan
-        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+    def time_sem(self):
+        self.df.groupby('value')['timestamp'].sem()
 
-    def time_groupby_nth_float32_none(self):
-        self.data2.groupby(self.labels).nth(0)
+    def time_size(self):
+        self.df.groupby('value')['timestamp'].size()
 
+    def time_skew(self):
+        self.df.groupby('value')['timestamp'].skew()
 
-class groupby_nth_float64_any(object):
-    goal_time = 0.2
+    def time_std(self):
+        self.df.groupby('value')['timestamp'].std()
 
-    def setup(self):
-        self.labels = np.arange(10000).repeat(10)
-        self.data = Series(randn(len(self.labels)))
-        self.data[::3] = np.nan
-        self.data[1::3] = np.nan
-        self.data2 = Series(randn(len(self.labels)), dtype='float32')
-        self.data2[::3] = np.nan
-        self.data2[1::3] = np.nan
-        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+    def time_sum(self):
+        self.df.groupby('value')['timestamp'].sum()
 
-    def time_groupby_nth_float64_any(self):
-        self.data.groupby(self.labels).nth(0, dropna='all')
+    def time_tail(self):
+        self.df.groupby('value')['timestamp'].tail()
 
+    def time_unique(self):
+        self.df.groupby('value')['timestamp'].unique()
 
-class groupby_nth_float64_none(object):
-    goal_time = 0.2
+    def time_value_counts(self):
+        self.df.groupby('value')['timestamp'].value_counts()
 
-    def setup(self):
-        self.labels = np.arange(10000).repeat(10)
-        self.data = Series(randn(len(self.labels)))
-        self.data[::3] = np.nan
-        self.data[1::3] = np.nan
-        self.data2 = Series(randn(len(self.labels)), dtype='float32')
-        self.data2[::3] = np.nan
-        self.data2[1::3] = np.nan
-        self.labels = self.labels.take(np.random.permutation(len(self.labels)))
+    def time_var(self):
+        self.df.groupby('value')['timestamp'].var()
 
-    def time_groupby_nth_float64_none(self):
-        self.data.groupby(self.labels).nth(0)
 
+#----------------------------------------------------------------------
+# Series.value_counts
 
-class groupby_nth_object_any(object):
+class series_value_counts(object):
     goal_time = 0.2
 
     def setup(self):
-        self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+        self.s = Series(np.random.randint(0, 1000, size=100000))
+        self.s2 = self.s.astype(float)
 
-    def time_groupby_nth_object_any(self):
-        self.df.groupby('b').nth(0, dropna='any')
+        self.K = 1000
+        self.N = 100000
+        self.uniques = tm.makeStringIndex(self.K).values
+        self.s3 = Series(np.tile(self.uniques, (self.N // self.K)))
 
+    def time_value_counts_int64(self):
+        self.s.value_counts()
 
-class groupby_nth_object_none(object):
-    goal_time = 0.2
+    def time_value_counts_float64(self):
+        self.s2.value_counts()
 
-    def setup(self):
-        self.df = DataFrame({'a': (['foo'] * 100000), 'b': range(100000), })
+    def time_value_counts_strings(self):
+        self.s3.value_counts()
 
-    def time_groupby_nth_object_none(self):
-        self.df.groupby('b').nth(0)
 
+#----------------------------------------------------------------------
+# pivot_table
 
 class groupby_pivot_table(object):
     goal_time = 0.2
@@ -1442,62 +629,8 @@ def time_groupby_pivot_table(self):
         self.df.pivot_table(index='key1', columns=['key2', 'key3'])
 
 
-class groupby_series_nth_any(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df =
DataFrame(np.random.randint(1, 100, (10000, 2))) - - def time_groupby_series_nth_any(self): - self.df[1].groupby(self.df[0]).nth(0, dropna='any') - - -class groupby_series_nth_none(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) - - def time_groupby_series_nth_none(self): - self.df[1].groupby(self.df[0]).nth(0) - - -class groupby_series_simple_cython(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.ngroups = 100 - self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) - self.simple_series = Series(np.random.randn(self.N)) - self.key1 = self.df['key1'] - - def time_groupby_series_simple_cython(self): - self.df.groupby('key1').rank(pct=True) - - def get_test_data(self, ngroups=100, n=100000): - self.unique_groups = range(self.ngroups) - self.arr = np.asarray(np.tile(self.unique_groups, (n / self.ngroups)), dtype=object) - if (len(self.arr) < n): - self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) - random.shuffle(self.arr) - return self.arr - - def f(self): - self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) - - -class groupby_simple_compress_timing(object): - goal_time = 0.2 - - def setup(self): - self.data = np.random.randn(1000000, 2) - self.labels = np.random.randint(0, 1000, size=1000000) - self.df = DataFrame(self.data) - - def time_groupby_simple_compress_timing(self): - self.df.groupby(self.labels).mean() - +#---------------------------------------------------------------------- +# Sum booleans #2692 class groupby_sum_booleans(object): goal_time = 0.2 @@ -1510,6 +643,9 @@ def time_groupby_sum_booleans(self): self.df.groupby('ii').sum() +#---------------------------------------------------------------------- +# multi-indexed group sum #9049 + class groupby_sum_multiindex(object): goal_time = 0.2 @@ -1521,6 +657,9 @@ def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() +#------------------------------------------------------------------------------- +# Transform testing + class groupby_transform(object): goal_time = 0.2 @@ -1535,7 +674,9 @@ def setup(self): self.secid_max = int('F0000000', 16) self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step)) - self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], labels=[[i for i in range(self.n_dates) for _ in xrange(self.n_securities)], (range(self.n_securities) * self.n_dates)], names=['date', 'security_id']) + self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], + labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)], + names=['date', 'security_id']) self.n_data = len(self.data_index) self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))]) self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) @@ -1550,8 +691,11 @@ def setup(self): def time_groupby_transform(self): self.data.groupby(level='security_id').transform(self.f_fillna) + def time_groupby_transform_ufunc(self): + self.data.groupby(level='date').transform(np.max) -class groupby_transform_multi_key1(object): + +class groupby_transform_multi_key(object): 
goal_time = 0.2 def setup(self): @@ -1628,66 +772,3 @@ def setup(self): def time_groupby_transform_series2(self): self.df.groupby('id')['val'].transform(np.mean) - - -class groupby_transform_ufunc(object): - goal_time = 0.2 - - def setup(self): - self.n_dates = 400 - self.n_securities = 250 - self.n_columns = 3 - self.share_na = 0.1 - self.dates = date_range('1997-12-31', periods=self.n_dates, freq='B') - self.dates = Index(map((lambda x: (((x.year * 10000) + (x.month * 100)) + x.day)), self.dates)) - self.secid_min = int('10000000', 16) - self.secid_max = int('F0000000', 16) - self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) - self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step)) - self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], labels=[[i for i in range(self.n_dates) for _ in xrange(self.n_securities)], (range(self.n_securities) * self.n_dates)], names=['date', 'security_id']) - self.n_data = len(self.data_index) - self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))]) - self.data = DataFrame(np.random.randn(self.n_data, self.n_columns), index=self.data_index, columns=self.columns) - self.step = int((self.n_data * self.share_na)) - for column_index in range(self.n_columns): - self.index = column_index - while (self.index < self.n_data): - self.data.set_value(self.data_index[self.index], self.columns[column_index], np.nan) - self.index += self.step - self.f_fillna = (lambda x: x.fillna(method='pad')) - - def time_groupby_transform_ufunc(self): - self.data.groupby(level='date').transform(np.max) - - -class series_value_counts_float64(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=100000)).astype(float) - - def time_series_value_counts_float64(self): - self.s.value_counts() - - -class series_value_counts_int64(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=100000)) - - def time_series_value_counts_int64(self): - self.s.value_counts() - - -class series_value_counts_strings(object): - goal_time = 0.2 - - def setup(self): - self.K = 1000 - self.N = 100000 - self.uniques = tm.makeStringIndex(self.K).values - self.s = Series(np.tile(self.uniques, (self.N // self.K))) - - def time_series_value_counts_strings(self): - self.s.value_counts()
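
----------------------------------------------------------------------
A note on the consolidated layout above (illustrative, not part of the patch): asv reports every time_* method of a benchmark class as its own benchmark and runs the class-level setup before each one, so folding many single-method classes into one multi-method class shares the setup code without coupling the measurements. A minimal sketch of the resulting pattern, with made-up names rather than ones taken from the patch:

    import numpy as np
    from pandas import DataFrame

    class groupby_example(object):
        # asv times each time_* method separately; setup runs before each one
        goal_time = 0.2

        def setup(self):
            np.random.seed(1234)
            self.df = DataFrame({'key': np.random.randint(0, 100, size=10000),
                                 'value': np.random.randn(10000)})

        def time_sum(self):
            self.df.groupby('key')['value'].sum()

        def time_mean(self):
            self.df.groupby('key')['value'].mean()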