-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: ASV stat_ops #19049
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
CLN: ASV stat_ops #19049
Changes from 3 commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,205 +1,122 @@ | ||
from .pandas_vb_common import * | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from .pandas_vb_common import setup # noqa | ||
|
||
def _set_use_bottleneck_False(): | ||
try: | ||
pd.options.compute.use_bottleneck = False | ||
except: | ||
from pandas.core import nanops | ||
nanops._USE_BOTTLENECK = False | ||
|
||
class Bottleneck(object): | ||
|
||
class FrameOps(object): | ||
goal_time = 0.2 | ||
params = ([True, False], ['DataFrame', 'Series']) | ||
param_names = ['use_bottleneck', 'constructor'] | ||
|
||
param_names = ['op', 'use_bottleneck', 'dtype', 'axis'] | ||
params = [['mean', 'sum', 'median'], | ||
[True, False], | ||
['float', 'int'], | ||
[0, 1]] | ||
|
||
def setup(self, op, use_bottleneck, dtype, axis): | ||
if dtype == 'float': | ||
self.df = DataFrame(np.random.randn(100000, 4)) | ||
elif dtype == 'int': | ||
self.df = DataFrame(np.random.randint(1000, size=(100000, 4))) | ||
|
||
if not use_bottleneck: | ||
_set_use_bottleneck_False() | ||
def setup(self, use_bottleneck, constructor): | ||
values = np.random.randn(10**6) | ||
self.data = getattr(pd, constructor)(values) | ||
try: | ||
pd.options.compute.use_bottleneck = use_bottleneck | ||
except: | ||
from pandas.core import nanops | ||
nanops._USE_BOTTLENECK = use_bottleneck | ||
|
||
self.func = getattr(self.df, op) | ||
def time_mean(self, use_bottleneck, constructor): | ||
self.data.mean() | ||
|
||
def time_op(self, op, use_bottleneck, dtype, axis): | ||
self.func(axis=axis) | ||
|
||
class FrameOps(object): | ||
|
||
class stat_ops_level_frame_sum(object): | ||
goal_time = 0.2 | ||
param_names = ['op', 'dtype', 'axis'] | ||
params = [['mean', 'sum', 'median', 'std'], | ||
['float', 'int'], | ||
[0, 1]] | ||
|
||
def setup(self): | ||
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) | ||
random.shuffle(self.index.values) | ||
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) | ||
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) | ||
|
||
def time_stat_ops_level_frame_sum(self): | ||
self.df.sum(level=1) | ||
|
||
|
||
class stat_ops_level_frame_sum_multiple(object): | ||
goal_time = 0.2 | ||
def setup(self, op, dtype, axis): | ||
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) | ||
self.df_func = getattr(df, op) | ||
|
||
def setup(self): | ||
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) | ||
random.shuffle(self.index.values) | ||
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) | ||
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) | ||
def time_op(self, op, dtype, axis): | ||
self.df_func(axis=axis) | ||
|
||
def time_stat_ops_level_frame_sum_multiple(self): | ||
self.df.sum(level=[0, 1]) | ||
|
||
class FrameMultiIndexOps(object): | ||
|
||
class stat_ops_level_series_sum(object): | ||
goal_time = 0.2 | ||
params = ([0, 1, [0, 1]], ['mean', 'sum', 'median']) | ||
param_names = ['level', 'op'] | ||
|
||
def setup(self): | ||
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) | ||
random.shuffle(self.index.values) | ||
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) | ||
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) | ||
def setup(self, level, op): | ||
levels = [np.arange(10), np.arange(100), np.arange(100)] | ||
labels = [np.arange(10).repeat(10000), | ||
np.tile(np.arange(100).repeat(100), 10), | ||
np.tile(np.tile(np.arange(100), 100), 10)] | ||
index = pd.MultiIndex(levels=levels, labels=labels) | ||
df = pd.DataFrame(np.random.randn(len(index), 4), index=index) | ||
self.df_func = getattr(df, op) | ||
|
||
def time_stat_ops_level_series_sum(self): | ||
self.df[1].sum(level=1) | ||
def time_op(self, level, op): | ||
self.df_func(level=level) | ||
|
||
|
||
class stat_ops_level_series_sum_multiple(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) | ||
random.shuffle(self.index.values) | ||
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) | ||
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) | ||
|
||
def time_stat_ops_level_series_sum_multiple(self): | ||
self.df[1].sum(level=[0, 1]) | ||
class SeriesOps(object): | ||
|
||
|
||
class stat_ops_series_std(object): | ||
goal_time = 0.2 | ||
param_names = ['op', 'dtype'] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What I meant about bottleneck was to make it a parameter in the ops, e.g. here |
||
params = [['mean', 'sum', 'median', 'std'], | ||
['float', 'int']] | ||
|
||
def setup(self): | ||
self.s = Series(np.random.randn(100000), index=np.arange(100000)) | ||
self.s[::2] = np.nan | ||
|
||
def time_stat_ops_series_std(self): | ||
self.s.std() | ||
def setup(self, op, dtype): | ||
s = pd.Series(np.random.randn(100000)).astype(dtype) | ||
self.s_func = getattr(s, op) | ||
|
||
def time_op(self, op, dtype): | ||
self.s_func() | ||
|
||
class stats_corr_spearman(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.df = DataFrame(np.random.randn(1000, 30)) | ||
class SeriesMultiIndexOps(object): | ||
|
||
def time_stats_corr_spearman(self): | ||
self.df.corr(method='spearman') | ||
|
||
|
||
class stats_rank2d_axis0_average(object): | ||
goal_time = 0.2 | ||
params = ([0, 1, [0, 1]], ['mean', 'sum', 'median']) | ||
param_names = ['level', 'op'] | ||
|
||
def setup(self): | ||
self.df = DataFrame(np.random.randn(5000, 50)) | ||
|
||
def time_stats_rank2d_axis0_average(self): | ||
self.df.rank() | ||
def setup(self, level, op): | ||
levels = [np.arange(10), np.arange(100), np.arange(100)] | ||
labels = [np.arange(10).repeat(10000), | ||
np.tile(np.arange(100).repeat(100), 10), | ||
np.tile(np.tile(np.arange(100), 100), 10)] | ||
index = pd.MultiIndex(levels=levels, labels=labels) | ||
s = pd.Series(np.random.randn(len(index)), index=index) | ||
self.s_func = getattr(s, op) | ||
|
||
def time_op(self, level, op): | ||
self.s_func(level=level) | ||
|
||
class stats_rank2d_axis1_average(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.df = DataFrame(np.random.randn(5000, 50)) | ||
|
||
def time_stats_rank2d_axis1_average(self): | ||
self.df.rank(1) | ||
class Rank(object): | ||
|
||
|
||
class stats_rank_average(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) | ||
self.s = Series(self.values) | ||
|
||
def time_stats_rank_average(self): | ||
self.s.rank() | ||
|
||
|
||
class stats_rank_average_int(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.values = np.random.randint(0, 100000, size=200000) | ||
self.s = Series(self.values) | ||
|
||
def time_stats_rank_average_int(self): | ||
self.s.rank() | ||
|
||
|
||
class stats_rank_pct_average(object): | ||
goal_time = 0.2 | ||
params = [['DataFrame', 'Series'], [True, False]] | ||
param_names = ['constructor', 'pct'] | ||
|
||
def setup(self): | ||
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) | ||
self.s = Series(self.values) | ||
def setup(self, constructor, pct): | ||
values = np.random.randn(10**5) | ||
self.data = getattr(pd, constructor)(values) | ||
|
||
def time_stats_rank_pct_average(self): | ||
self.s.rank(pct=True) | ||
def time_rank(self, constructor, pct): | ||
self.data.rank(pct=pct) | ||
|
||
def time_average_old(self, constructor, pct): | ||
self.data.rank(pct=pct) / len(self.data) | ||
|
||
class stats_rank_pct_average_old(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) | ||
self.s = Series(self.values) | ||
|
||
def time_stats_rank_pct_average_old(self): | ||
(self.s.rank() / len(self.s)) | ||
|
||
class Correlation(object): | ||
|
||
class stats_rolling_mean(object): | ||
goal_time = 0.2 | ||
params = ['spearman', 'kendall', 'pearson'] | ||
param_names = ['method'] | ||
|
||
def setup(self): | ||
self.arr = np.random.randn(100000) | ||
self.win = 100 | ||
|
||
def time_rolling_mean(self): | ||
rolling_mean(self.arr, self.win) | ||
|
||
def time_rolling_median(self): | ||
rolling_median(self.arr, self.win) | ||
|
||
def time_rolling_min(self): | ||
rolling_min(self.arr, self.win) | ||
|
||
def time_rolling_max(self): | ||
rolling_max(self.arr, self.win) | ||
|
||
def time_rolling_sum(self): | ||
rolling_sum(self.arr, self.win) | ||
|
||
def time_rolling_std(self): | ||
rolling_std(self.arr, self.win) | ||
|
||
def time_rolling_var(self): | ||
rolling_var(self.arr, self.win) | ||
|
||
def time_rolling_skew(self): | ||
rolling_skew(self.arr, self.win) | ||
def setup(self, method): | ||
self.df = pd.DataFrame(np.random.randn(1000, 30)) | ||
|
||
def time_rolling_kurt(self): | ||
rolling_kurt(self.arr, self.win) | ||
def time_corr(self, method): | ||
self.df.corr(method=method) |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And use bottleneck here; also expand these to all of the stat ops (min, max, var, kurt, etc.).