Skip to content

CLN: ASV stat_ops #19049

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 6, 2018
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 71 additions & 161 deletions asv_bench/benchmarks/stat_ops.py
Original file line number Diff line number Diff line change
@@ -1,205 +1,115 @@
from .pandas_vb_common import *
import numpy as np
import pandas as pd


def _set_use_bottleneck_False():
    """Turn off bottleneck-accelerated reductions in pandas.

    The public ``pd.options.compute.use_bottleneck`` option is tried first.
    If that option cannot be looked up or set, the private
    ``pandas.core.nanops._USE_BOTTLENECK`` flag is cleared instead.
    """
    try:
        pd.options.compute.use_bottleneck = False
    except (AttributeError, KeyError):
        # Only a failed option lookup should trigger the fallback; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        from pandas.core import nanops
        nanops._USE_BOTTLENECK = False
from .pandas_vb_common import setup # noqa


class FrameOps(object):
goal_time = 0.2

goal_time = 0.2
param_names = ['op', 'use_bottleneck', 'dtype', 'axis']
params = [['mean', 'sum', 'median'],
params = [['mean', 'sum', 'median', 'std'],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could make use_bottleneck a param here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use_bottleneck is a param here, over [True, False]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use bottleneck here as well, and expand these to all of the stat ops (min, max, var, kurt, etc.).

[True, False],
['float', 'int'],
[0, 1]]

def setup(self, op, use_bottleneck, dtype, axis):
if dtype == 'float':
self.df = DataFrame(np.random.randn(100000, 4))
elif dtype == 'int':
self.df = DataFrame(np.random.randint(1000, size=(100000, 4)))

if not use_bottleneck:
_set_use_bottleneck_False()

self.func = getattr(self.df, op)
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
try:
pd.options.compute.use_bottleneck = use_bottleneck
except:
from pandas.core import nanops
nanops._USE_BOTTLENECK = use_bottleneck
self.df_func = getattr(df, op)

def time_op(self, op, use_bottleneck, dtype, axis):
self.func(axis=axis)


class stat_ops_level_frame_sum(object):
    """Benchmark ``DataFrame.sum(level=1)`` on a three-level MultiIndex frame.

    ``setup`` builds a 100000-row frame of four standard-normal columns whose
    index is the full 10 x 100 x 100 cartesian product.
    """
    goal_time = 0.2

    def setup(self):
        # from_product yields the same levels and row order as hand-written
        # repeat/tile label arrays, without the ``labels=`` keyword.
        self.index = pd.MultiIndex.from_product(
            [np.arange(10), np.arange(100), np.arange(100)])
        # The index is deliberately not shuffled through ``index.values``:
        # that only permutes the materialised tuples, not the level codes
        # that a level-based sum groups on.
        self.df = pd.DataFrame(np.random.randn(len(self.index), 4),
                               index=self.index)

    def time_stat_ops_level_frame_sum(self):
        # Timed operation: reduce over index level 1.
        self.df.sum(level=1)


class stat_ops_level_frame_sum_multiple(object):
    """Benchmark ``DataFrame.sum(level=[0, 1])`` on a three-level MultiIndex frame.

    ``setup`` builds a 100000-row frame of four standard-normal columns whose
    index is the full 10 x 100 x 100 cartesian product.
    """
    goal_time = 0.2

    def setup(self):
        # from_product yields the same levels and row order as hand-written
        # repeat/tile label arrays, without the ``labels=`` keyword.
        self.index = pd.MultiIndex.from_product(
            [np.arange(10), np.arange(100), np.arange(100)])
        # The index is deliberately not shuffled through ``index.values``:
        # that only permutes the materialised tuples, not the level codes
        # that a level-based sum groups on.
        self.df = pd.DataFrame(np.random.randn(len(self.index), 4),
                               index=self.index)

    def time_stat_ops_level_frame_sum_multiple(self):
        # Timed operation: reduce over the first two index levels together.
        self.df.sum(level=[0, 1])


class stat_ops_level_series_sum(object):
    """Benchmark ``Series.sum(level=1)`` on one column of a MultiIndex frame.

    ``setup`` builds a 100000-row frame of four standard-normal columns whose
    index is the full 10 x 100 x 100 cartesian product; the timed method
    reduces column ``1`` of that frame.
    """
    goal_time = 0.2

    def setup(self):
        # from_product yields the same levels and row order as hand-written
        # repeat/tile label arrays, without the ``labels=`` keyword.
        self.index = pd.MultiIndex.from_product(
            [np.arange(10), np.arange(100), np.arange(100)])
        # The index is deliberately not shuffled through ``index.values``:
        # that only permutes the materialised tuples, not the level codes
        # that a level-based sum groups on.
        self.df = pd.DataFrame(np.random.randn(len(self.index), 4),
                               index=self.index)

    def time_stat_ops_level_series_sum(self):
        # Timed operation: select column 1, then reduce over index level 1.
        self.df[1].sum(level=1)


class stat_ops_level_series_sum_multiple(object):
goal_time = 0.2

def setup(self):
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
self.df_func(axis=axis)

def time_stat_ops_level_series_sum_multiple(self):
self.df[1].sum(level=[0, 1])

class FrameMultiIndexOps(object):

class stat_ops_series_std(object):
goal_time = 0.2
params = ([0, 1, [0, 1]], ['mean', 'sum', 'median'])
param_names = ['level', 'op']

def setup(self):
self.s = Series(np.random.randn(100000), index=np.arange(100000))
self.s[::2] = np.nan
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
labels = [np.arange(10).repeat(10000),
np.tile(np.arange(100).repeat(100), 10),
np.tile(np.tile(np.arange(100), 100), 10)]
index = pd.MultiIndex(levels=levels, labels=labels)
df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
self.df_func = getattr(df, op)

def time_stat_ops_series_std(self):
self.s.std()
def time_op(self, level, op):
self.df_func(level=level)


class stats_corr_spearman(object):
    """Benchmark ``DataFrame.corr(method='spearman')`` on a 1000 x 30 frame."""
    goal_time = 0.2

    def setup(self):
        # Use the module's explicit ``pd`` alias instead of a ``DataFrame``
        # name that is only in scope through the wildcard import.
        self.df = pd.DataFrame(np.random.randn(1000, 30))

    def time_stats_corr_spearman(self):
        # Timed operation: pairwise Spearman rank correlation of all columns.
        self.df.corr(method='spearman')
class SeriesOps(object):


class stats_rank2d_axis0_average(object):
goal_time = 0.2
param_names = ['op', 'use_bottleneck', 'dtype']
params = [['mean', 'sum', 'median', 'std'],
[True, False],
['float', 'int']]

def setup(self):
self.df = DataFrame(np.random.randn(5000, 50))

def time_stats_rank2d_axis0_average(self):
self.df.rank()


class stats_rank2d_axis1_average(object):
goal_time = 0.2
def setup(self, op, use_bottleneck, dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could put the bottleneck tests in a separate function / base class I think

s = pd.Series(np.random.randn(100000)).astype(dtype)
try:
pd.options.compute.use_bottleneck = use_bottleneck
except:
from pandas.core import nanops
nanops._USE_BOTTLENECK = use_bottleneck
self.s_func = getattr(s, op)

def setup(self):
self.df = DataFrame(np.random.randn(5000, 50))
def time_op(self, op, use_bottleneck, dtype):
self.s_func()

def time_stats_rank2d_axis1_average(self):
self.df.rank(1)

class SeriesMultiIndexOps(object):

class stats_rank_average(object):
goal_time = 0.2
params = ([0, 1, [0, 1]], ['mean', 'sum', 'median'])
param_names = ['level', 'op']

def setup(self):
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
self.s = Series(self.values)

def time_stats_rank_average(self):
self.s.rank()


class stats_rank_average_int(object):
goal_time = 0.2
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
labels = [np.arange(10).repeat(10000),
np.tile(np.arange(100).repeat(100), 10),
np.tile(np.tile(np.arange(100), 100), 10)]
index = pd.MultiIndex(levels=levels, labels=labels)
s = pd.Series(np.random.randn(len(index)), index=index)
self.s_func = getattr(s, op)

def setup(self):
self.values = np.random.randint(0, 100000, size=200000)
self.s = Series(self.values)
def time_op(self, level, op):
self.s_func(level=level)

def time_stats_rank_average_int(self):
self.s.rank()

class Rank(object):

class stats_rank_pct_average(object):
goal_time = 0.2
params = [['DataFrame', 'Series'], [True, False]]
param_names = ['constructor', 'pct']

def setup(self):
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
self.s = Series(self.values)

def time_stats_rank_pct_average(self):
self.s.rank(pct=True)

def setup(self, constructor, pct):
values = np.random.randn(10**5)
self.data = getattr(pd, constructor)(values)

class stats_rank_pct_average_old(object):
goal_time = 0.2
def time_rank(self, constructor, pct):
self.data.rank(pct=pct)

def setup(self):
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
self.s = Series(self.values)
def time_average_old(self, constructor, pct):
self.data.rank(pct=pct) / len(self.data)

def time_stats_rank_pct_average_old(self):
(self.s.rank() / len(self.s))

class Correlation(object):

class stats_rolling_mean(object):
goal_time = 0.2
params = ['spearman', 'kendall', 'pearson']
param_names = ['method']

def setup(self):
self.arr = np.random.randn(100000)
self.win = 100

def time_rolling_mean(self):
rolling_mean(self.arr, self.win)

def time_rolling_median(self):
rolling_median(self.arr, self.win)

def time_rolling_min(self):
rolling_min(self.arr, self.win)

def time_rolling_max(self):
rolling_max(self.arr, self.win)

def time_rolling_sum(self):
rolling_sum(self.arr, self.win)

def time_rolling_std(self):
rolling_std(self.arr, self.win)

def time_rolling_var(self):
rolling_var(self.arr, self.win)

def time_rolling_skew(self):
rolling_skew(self.arr, self.win)
def setup(self, method):
self.df = pd.DataFrame(np.random.randn(1000, 30))

def time_rolling_kurt(self):
rolling_kurt(self.arr, self.win)
def time_corr(self, method):
self.df.corr(method=method)