From c10c00e2345b1c396d363b82c6d0cd6f44854bae Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 1 Jan 2018 15:18:56 -0800 Subject: [PATCH 1/4] CLN: ASV stat_ops --- asv_bench/benchmarks/rolling.py | 15 ++ asv_bench/benchmarks/stat_ops.py | 232 ++++++++++--------------------- 2 files changed, 86 insertions(+), 161 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 45142c53dcd01..2ed8c17c37d9f 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -39,3 +39,18 @@ def setup(self, contructor, window, dtype, percentile): def time_quantile(self, contructor, window, dtype, percentile): self.roll.quantile(percentile) + + +class DepreciatedRolling(object): + + sample_time = 0.2 + params = ['rolling_median', 'rolling_mean', 'rolling_min', 'rolling_max', + 'rolling_var', 'rolling_skew', 'rolling_kurt', 'rolling_std'] + param_names = ['method'] + + def setup(self, method): + self.arr = np.random.randn(100000) + self.win = 100 + + def time_method(self, method): + getattr(pd, method)(self.arr, self.win) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 1e1eb167b46bf..35f15460a8da4 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,205 +1,115 @@ -from .pandas_vb_common import * +import numpy as np +import pandas as pd - -def _set_use_bottleneck_False(): - try: - pd.options.compute.use_bottleneck = False - except: - from pandas.core import nanops - nanops._USE_BOTTLENECK = False +from .pandas_vb_common import setup # noqa class FrameOps(object): - goal_time = 0.2 + goal_time = 0.2 param_names = ['op', 'use_bottleneck', 'dtype', 'axis'] - params = [['mean', 'sum', 'median'], + params = [['mean', 'sum', 'median', 'std'], [True, False], ['float', 'int'], [0, 1]] def setup(self, op, use_bottleneck, dtype, axis): - if dtype == 'float': - self.df = DataFrame(np.random.randn(100000, 4)) - elif dtype == 'int': - self.df = DataFrame(np.random.randint(1000, size=(100000, 4))) - - if not use_bottleneck: - _set_use_bottleneck_False() - - self.func = getattr(self.df, op) + df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.df_func = getattr(df, op) def time_op(self, op, use_bottleneck, dtype, axis): - self.func(axis=axis) - - -class stat_ops_level_frame_sum(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_frame_sum(self): - self.df.sum(level=1) - - -class stat_ops_level_frame_sum_multiple(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_frame_sum_multiple(self): - self.df.sum(level=[0, 1]) - - -class stat_ops_level_series_sum(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_series_sum(self): - self.df[1].sum(level=1) - - -class stat_ops_level_series_sum_multiple(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + self.df_func(axis=axis) - def time_stat_ops_level_series_sum_multiple(self): - self.df[1].sum(level=[0, 1]) +class FrameMultiIndexOps(object): -class stat_ops_series_std(object): goal_time = 0.2 + params = ([0, 1, [0, 1]], ['mean', 'sum', 'median']) + param_names = ['level', 'op'] - def setup(self): - self.s = Series(np.random.randn(100000), index=np.arange(100000)) - self.s[::2] = np.nan + def setup(self, level, op): + levels = [np.arange(10), np.arange(100), np.arange(100)] + labels = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, labels=labels) + df = pd.DataFrame(np.random.randn(len(index), 4), index=index) + self.df_func = getattr(df, op) - def time_stat_ops_series_std(self): - self.s.std() + def time_op(self, level, op): + self.df_func(level=level) -class stats_corr_spearman(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 30)) - - def time_stats_corr_spearman(self): - self.df.corr(method='spearman') +class SeriesOps(object): - -class stats_rank2d_axis0_average(object): goal_time = 0.2 + param_names = ['op', 'use_bottleneck', 'dtype'] + params = [['mean', 'sum', 'median', 'std'], + [True, False], + ['float', 'int']] - def setup(self): - self.df = DataFrame(np.random.randn(5000, 50)) - - def time_stats_rank2d_axis0_average(self): - self.df.rank() - - -class stats_rank2d_axis1_average(object): - goal_time = 0.2 + def setup(self, op, use_bottleneck, dtype): + s = pd.Series(np.random.randn(100000)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.s_func = getattr(s, op) - def setup(self): - self.df = DataFrame(np.random.randn(5000, 50)) + def time_op(self, op, use_bottleneck, dtype): + self.s_func() - def time_stats_rank2d_axis1_average(self): - self.df.rank(1) +class SeriesMultiIndexOps(object): -class stats_rank_average(object): goal_time = 0.2 + params = ([0, 1, [0, 1]], ['mean', 'sum', 'median']) + param_names = ['level', 'op'] - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) - - def time_stats_rank_average(self): - self.s.rank() - - -class stats_rank_average_int(object): - goal_time = 0.2 + def setup(self, level, op): + levels = [np.arange(10), np.arange(100), np.arange(100)] + labels = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, labels=labels) + s = pd.Series(np.random.randn(len(index)), index=index) + self.s_func = getattr(s, op) - def setup(self): - self.values = np.random.randint(0, 100000, size=200000) - self.s = Series(self.values) + def time_op(self, level, op): + self.s_func(level=level) - def time_stats_rank_average_int(self): - self.s.rank() +class Rank(object): -class stats_rank_pct_average(object): goal_time = 0.2 + params = [['DataFrame', 'Series'], [True, False]] + param_names = ['constructor', 'pct'] - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) - - def time_stats_rank_pct_average(self): - self.s.rank(pct=True) - + def setup(self, constructor, pct): + values = np.random.randn(10**5) + self.data = getattr(pd, constructor)(values) -class stats_rank_pct_average_old(object): - goal_time = 0.2 + def time_rank(self, constructor, pct): + self.data.rank(pct=pct) - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) + def time_average_old(self, constructor, pct): + self.data.rank(pct=pct) / len(self.data) - def time_stats_rank_pct_average_old(self): - (self.s.rank() / len(self.s)) +class Correlation(object): -class stats_rolling_mean(object): goal_time = 0.2 + params = ['spearman', 'kendall', 'pearson'] + param_names = ['method'] - def setup(self): - self.arr = np.random.randn(100000) - self.win = 100 - - def time_rolling_mean(self): - rolling_mean(self.arr, self.win) - - def time_rolling_median(self): - rolling_median(self.arr, self.win) - - def time_rolling_min(self): - rolling_min(self.arr, self.win) - - def time_rolling_max(self): - rolling_max(self.arr, self.win) - - def time_rolling_sum(self): - rolling_sum(self.arr, self.win) - - def time_rolling_std(self): - rolling_std(self.arr, self.win) - - def time_rolling_var(self): - rolling_var(self.arr, self.win) - - def time_rolling_skew(self): - rolling_skew(self.arr, self.win) + def setup(self, method): + self.df = pd.DataFrame(np.random.randn(1000, 30)) - def time_rolling_kurt(self): - rolling_kurt(self.arr, self.win) + def time_corr(self, method): + self.df.corr(method=method) From 5fd9420affd74e8097e40509fc34c0c0839fd3ce Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 3 Jan 2018 20:16:39 -0800 Subject: [PATCH 2/4] Remove old rolling method benchmarks --- asv_bench/benchmarks/rolling.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 2ed8c17c37d9f..45142c53dcd01 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -39,18 +39,3 @@ def setup(self, contructor, window, dtype, percentile): def time_quantile(self, contructor, window, dtype, percentile): self.roll.quantile(percentile) - - -class DepreciatedRolling(object): - - sample_time = 0.2 - params = ['rolling_median', 'rolling_mean', 'rolling_min', 'rolling_max', - 'rolling_var', 'rolling_skew', 'rolling_kurt', 'rolling_std'] - param_names = ['method'] - - def setup(self, method): - self.arr = np.random.randn(100000) - self.win = 100 - - def time_method(self, method): - getattr(pd, method)(self.arr, self.win) From a44d0597f904ef6025a053d8272f4b074550ee70 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 4 Jan 2018 22:05:12 -0800 Subject: [PATCH 3/4] create bottleneck class --- asv_bench/benchmarks/stat_ops.py | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 35f15460a8da4..089ec5df4ea28 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -4,25 +4,38 @@ from .pandas_vb_common import setup # noqa -class FrameOps(object): +class Bottleneck(object): goal_time = 0.2 - param_names = ['op', 'use_bottleneck', 'dtype', 'axis'] - params = [['mean', 'sum', 'median', 'std'], - [True, False], - ['float', 'int'], - [0, 1]] + params = ([True, False], ['DataFrame', 'Series']) + param_names = ['use_bottleneck', 'constructor'] - def setup(self, op, use_bottleneck, dtype, axis): - df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + def setup(self, use_bottleneck, constructor): + values = np.random.randn(10**6) + self.data = getattr(pd, constructor)(values) try: pd.options.compute.use_bottleneck = use_bottleneck except: from pandas.core import nanops nanops._USE_BOTTLENECK = use_bottleneck + + def time_mean(self, use_bottleneck, constructor): + self.data.mean() + + +class FrameOps(object): + + goal_time = 0.2 + param_names = ['op', 'dtype', 'axis'] + params = [['mean', 'sum', 'median', 'std'], + ['float', 'int'], + [0, 1]] + + def setup(self, op, dtype, axis): + df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) self.df_func = getattr(df, op) - def time_op(self, op, use_bottleneck, dtype, axis): + def time_op(self, op, dtype, axis): self.df_func(axis=axis) @@ -48,21 +61,15 @@ def time_op(self, level, op): class SeriesOps(object): goal_time = 0.2 - param_names = ['op', 'use_bottleneck', 'dtype'] + param_names = ['op', 'dtype'] params = [['mean', 'sum', 'median', 'std'], - [True, False], ['float', 'int']] - def setup(self, op, use_bottleneck, dtype): + def setup(self, op, dtype): s = pd.Series(np.random.randn(100000)).astype(dtype) - try: - pd.options.compute.use_bottleneck = use_bottleneck - except: - from pandas.core import nanops - nanops._USE_BOTTLENECK = use_bottleneck self.s_func = getattr(s, op) - def time_op(self, op, use_bottleneck, dtype): + def time_op(self, op, dtype): self.s_func() From d2593a374283edfde13a730670c4e56f5bbc6fe8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 5 Jan 2018 21:36:42 -0800 Subject: [PATCH 4/4] Add bottleneck params and more ops --- asv_bench/benchmarks/stat_ops.py | 50 ++++++++++++++------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 089ec5df4ea28..c447c78d0d070 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -4,45 +4,33 @@ from .pandas_vb_common import setup # noqa -class Bottleneck(object): +ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem', + 'var'] + + +class FrameOps(object): goal_time = 0.2 - params = ([True, False], ['DataFrame', 'Series']) - param_names = ['use_bottleneck', 'constructor'] + params = [ops, ['float', 'int'], [0, 1], [True, False]] + param_names = ['op', 'dtype', 'axis', 'use_bottleneck'] - def setup(self, use_bottleneck, constructor): - values = np.random.randn(10**6) - self.data = getattr(pd, constructor)(values) + def setup(self, op, dtype, axis, use_bottleneck): + df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) try: pd.options.compute.use_bottleneck = use_bottleneck except: from pandas.core import nanops nanops._USE_BOTTLENECK = use_bottleneck - - def time_mean(self, use_bottleneck, constructor): - self.data.mean() - - -class FrameOps(object): - - goal_time = 0.2 - param_names = ['op', 'dtype', 'axis'] - params = [['mean', 'sum', 'median', 'std'], - ['float', 'int'], - [0, 1]] - - def setup(self, op, dtype, axis): - df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) self.df_func = getattr(df, op) - def time_op(self, op, dtype, axis): + def time_op(self, op, dtype, axis, use_bottleneck): self.df_func(axis=axis) class FrameMultiIndexOps(object): goal_time = 0.2 - params = ([0, 1, [0, 1]], ['mean', 'sum', 'median']) + params = ([0, 1, [0, 1]], ops) param_names = ['level', 'op'] def setup(self, level, op): @@ -61,22 +49,26 @@ def time_op(self, level, op): class SeriesOps(object): goal_time = 0.2 - param_names = ['op', 'dtype'] - params = [['mean', 'sum', 'median', 'std'], - ['float', 'int']] + params = [ops, ['float', 'int'], [True, False]] + param_names = ['op', 'dtype', 'use_bottleneck'] - def setup(self, op, dtype): + def setup(self, op, dtype, use_bottleneck): s = pd.Series(np.random.randn(100000)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.s_func = getattr(s, op) - def time_op(self, op, dtype): + def time_op(self, op, dtype, use_bottleneck): self.s_func() class SeriesMultiIndexOps(object): goal_time = 0.2 - params = ([0, 1, [0, 1]], ['mean', 'sum', 'median']) + params = ([0, 1, [0, 1]], ops) param_names = ['level', 'op'] def setup(self, level, op):