"""asv benchmarks for pandas reductions, ranking and correlation."""
import numpy as np
import pandas as pd

# NOTE(review): imported but not referenced in this module (hence the noqa);
# presumably picked up by asv as the module-level setup hook -- confirm
# against pandas_vb_common.
from .pandas_vb_common import setup  # noqa


# Names of the Series/DataFrame methods that the *Ops benchmarks look up with
# ``getattr`` and call.
ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem',
       'var']


def _set_use_bottleneck(use_bottleneck):
    """Turn pandas' bottleneck usage on or off.

    The ``compute.use_bottleneck`` option is tried first.  If setting it
    fails for any ordinary reason, the private
    ``pandas.core.nanops._USE_BOTTLENECK`` flag is assigned instead.

    Parameters
    ----------
    use_bottleneck : bool
        Value to assign to the option (or to the fallback flag).
    """
    try:
        pd.options.compute.use_bottleneck = use_bottleneck
    except Exception:
        # Best-effort fallback.  ``Exception`` rather than a bare ``except:``
        # so that KeyboardInterrupt / SystemExit still propagate.
        from pandas.core import nanops
        nanops._USE_BOTTLENECK = use_bottleneck


def _make_multiindex():
    """Return the three-level MultiIndex used by the ``level=`` benchmarks.

    The levels hold 10, 100 and 100 values and every integer-code array has
    100000 entries.
    """
    levels = [np.arange(10), np.arange(100), np.arange(100)]
    codes = [np.arange(10).repeat(10000),
             np.tile(np.arange(100).repeat(100), 10),
             np.tile(np.tile(np.arange(100), 100), 10)]
    try:
        return pd.MultiIndex(levels=levels, codes=codes)
    except TypeError:
        # Older pandas only accepts this argument under the name ``labels``.
        return pd.MultiIndex(levels=levels, labels=codes)


class FrameOps(object):
    """Time every method in ``ops`` on a 100000 x 4 DataFrame."""

    goal_time = 0.2
    params = [ops, ['float', 'int'], [0, 1], [True, False]]
    param_names = ['op', 'dtype', 'axis', 'use_bottleneck']

    def setup(self, op, dtype, axis, use_bottleneck):
        """Build the frame, set bottleneck usage and bind the method."""
        df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
        _set_use_bottleneck(use_bottleneck)
        self.df_func = getattr(df, op)

    def time_op(self, op, dtype, axis, use_bottleneck):
        """Call the bound method with ``axis=axis``."""
        self.df_func(axis=axis)


class FrameMultiIndexOps(object):
    """Time every method in ``ops`` with ``level=`` on a MultiIndex frame."""

    goal_time = 0.2
    params = ([0, 1, [0, 1]], ops)
    param_names = ['level', 'op']

    def setup(self, level, op):
        """Build a 4-column frame over the MultiIndex and bind the method."""
        index = _make_multiindex()
        df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
        self.df_func = getattr(df, op)

    def time_op(self, level, op):
        """Call the bound method with ``level=level``."""
        self.df_func(level=level)


class SeriesOps(object):
    """Time every method in ``ops`` on a 100000-element Series."""

    goal_time = 0.2
    params = [ops, ['float', 'int'], [True, False]]
    param_names = ['op', 'dtype', 'use_bottleneck']

    def setup(self, op, dtype, use_bottleneck):
        """Build the series, set bottleneck usage and bind the method."""
        s = pd.Series(np.random.randn(100000)).astype(dtype)
        _set_use_bottleneck(use_bottleneck)
        self.s_func = getattr(s, op)

    def time_op(self, op, dtype, use_bottleneck):
        """Call the bound method with no arguments."""
        self.s_func()


class SeriesMultiIndexOps(object):
    """Time every method in ``ops`` with ``level=`` on a MultiIndex series."""

    goal_time = 0.2
    params = ([0, 1, [0, 1]], ops)
    param_names = ['level', 'op']

    def setup(self, level, op):
        """Build a series over the MultiIndex and bind the method."""
        index = _make_multiindex()
        s = pd.Series(np.random.randn(len(index)), index=index)
        self.s_func = getattr(s, op)

    def time_op(self, level, op):
        """Call the bound method with ``level=level``."""
        self.s_func(level=level)


class Rank(object):
    """Time ``rank`` on a DataFrame or Series built from 10**5 values."""

    goal_time = 0.2
    params = [['DataFrame', 'Series'], [True, False]]
    param_names = ['constructor', 'pct']

    def setup(self, constructor, pct):
        """Build the object by looking ``constructor`` up on ``pd``."""
        values = np.random.randn(10**5)
        self.data = getattr(pd, constructor)(values)

    def time_rank(self, constructor, pct):
        """Call ``rank(pct=pct)``."""
        self.data.rank(pct=pct)

    def time_average_old(self, constructor, pct):
        """Call ``rank(pct=pct)`` and divide the result by ``len(self.data)``.

        NOTE(review): ``pct`` is still forwarded here, so for ``pct=True``
        the result is divided twice -- confirm that is the intended
        workload.
        """
        self.data.rank(pct=pct) / len(self.data)


class Correlation(object):
    """Time ``DataFrame.corr`` on a 1000 x 30 frame for each method."""

    goal_time = 0.2
    params = ['spearman', 'kendall', 'pearson']
    param_names = ['method']

    def setup(self, method):
        """Build the random 1000 x 30 frame."""
        self.df = pd.DataFrame(np.random.randn(1000, 30))

    def time_corr(self, method):
        """Call ``corr(method=method)``."""
        self.df.corr(method=method)