Skip to content

CLN: ASV stat_ops #19049

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 6, 2018
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 78 additions & 161 deletions asv_bench/benchmarks/stat_ops.py
Original file line number Diff line number Diff line change
@@ -1,205 +1,122 @@
from .pandas_vb_common import *
import numpy as np
import pandas as pd

from .pandas_vb_common import setup # noqa

def _set_use_bottleneck_False():
try:
pd.options.compute.use_bottleneck = False
except:
from pandas.core import nanops
nanops._USE_BOTTLENECK = False

class Bottleneck(object):

class FrameOps(object):
goal_time = 0.2
params = ([True, False], ['DataFrame', 'Series'])
param_names = ['use_bottleneck', 'constructor']

param_names = ['op', 'use_bottleneck', 'dtype', 'axis']
params = [['mean', 'sum', 'median'],
[True, False],
['float', 'int'],
[0, 1]]

def setup(self, op, use_bottleneck, dtype, axis):
if dtype == 'float':
self.df = DataFrame(np.random.randn(100000, 4))
elif dtype == 'int':
self.df = DataFrame(np.random.randint(1000, size=(100000, 4)))

if not use_bottleneck:
_set_use_bottleneck_False()
def setup(self, use_bottleneck, constructor):
values = np.random.randn(10**6)
self.data = getattr(pd, constructor)(values)
try:
pd.options.compute.use_bottleneck = use_bottleneck
except:
from pandas.core import nanops
nanops._USE_BOTTLENECK = use_bottleneck

self.func = getattr(self.df, op)
def time_mean(self, use_bottleneck, constructor):
self.data.mean()

def time_op(self, op, use_bottleneck, dtype, axis):
self.func(axis=axis)

class FrameOps(object):

class stat_ops_level_frame_sum(object):
goal_time = 0.2
param_names = ['op', 'dtype', 'axis']
params = [['mean', 'sum', 'median', 'std'],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And use bottleneck here; also expand these to cover all of the stat ops (min, max, var, kurt, etc.).

['float', 'int'],
[0, 1]]

def setup(self):
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])

def time_stat_ops_level_frame_sum(self):
self.df.sum(level=1)


class stat_ops_level_frame_sum_multiple(object):
goal_time = 0.2
def setup(self, op, dtype, axis):
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
self.df_func = getattr(df, op)

def setup(self):
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
def time_op(self, op, dtype, axis):
self.df_func(axis=axis)

def time_stat_ops_level_frame_sum_multiple(self):
self.df.sum(level=[0, 1])

class FrameMultiIndexOps(object):

class stat_ops_level_series_sum(object):
goal_time = 0.2
params = ([0, 1, [0, 1]], ['mean', 'sum', 'median'])
param_names = ['level', 'op']

def setup(self):
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
labels = [np.arange(10).repeat(10000),
np.tile(np.arange(100).repeat(100), 10),
np.tile(np.tile(np.arange(100), 100), 10)]
index = pd.MultiIndex(levels=levels, labels=labels)
df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
self.df_func = getattr(df, op)

def time_stat_ops_level_series_sum(self):
self.df[1].sum(level=1)
def time_op(self, level, op):
self.df_func(level=level)


class stat_ops_level_series_sum_multiple(object):
goal_time = 0.2

def setup(self):
self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])

def time_stat_ops_level_series_sum_multiple(self):
self.df[1].sum(level=[0, 1])
class SeriesOps(object):


class stat_ops_series_std(object):
goal_time = 0.2
param_names = ['op', 'dtype']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant about bottleneck was to make it a parameter in ops, e.g. here.

params = [['mean', 'sum', 'median', 'std'],
['float', 'int']]

def setup(self):
self.s = Series(np.random.randn(100000), index=np.arange(100000))
self.s[::2] = np.nan

def time_stat_ops_series_std(self):
self.s.std()
def setup(self, op, dtype):
s = pd.Series(np.random.randn(100000)).astype(dtype)
self.s_func = getattr(s, op)

def time_op(self, op, dtype):
self.s_func()

class stats_corr_spearman(object):
goal_time = 0.2

def setup(self):
self.df = DataFrame(np.random.randn(1000, 30))
class SeriesMultiIndexOps(object):

def time_stats_corr_spearman(self):
self.df.corr(method='spearman')


class stats_rank2d_axis0_average(object):
goal_time = 0.2
params = ([0, 1, [0, 1]], ['mean', 'sum', 'median'])
param_names = ['level', 'op']

def setup(self):
self.df = DataFrame(np.random.randn(5000, 50))

def time_stats_rank2d_axis0_average(self):
self.df.rank()
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
labels = [np.arange(10).repeat(10000),
np.tile(np.arange(100).repeat(100), 10),
np.tile(np.tile(np.arange(100), 100), 10)]
index = pd.MultiIndex(levels=levels, labels=labels)
s = pd.Series(np.random.randn(len(index)), index=index)
self.s_func = getattr(s, op)

def time_op(self, level, op):
self.s_func(level=level)

class stats_rank2d_axis1_average(object):
goal_time = 0.2

def setup(self):
self.df = DataFrame(np.random.randn(5000, 50))

def time_stats_rank2d_axis1_average(self):
self.df.rank(1)
class Rank(object):


class stats_rank_average(object):
goal_time = 0.2

def setup(self):
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
self.s = Series(self.values)

def time_stats_rank_average(self):
self.s.rank()


class stats_rank_average_int(object):
goal_time = 0.2

def setup(self):
self.values = np.random.randint(0, 100000, size=200000)
self.s = Series(self.values)

def time_stats_rank_average_int(self):
self.s.rank()


class stats_rank_pct_average(object):
goal_time = 0.2
params = [['DataFrame', 'Series'], [True, False]]
param_names = ['constructor', 'pct']

def setup(self):
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
self.s = Series(self.values)
def setup(self, constructor, pct):
values = np.random.randn(10**5)
self.data = getattr(pd, constructor)(values)

def time_stats_rank_pct_average(self):
self.s.rank(pct=True)
def time_rank(self, constructor, pct):
self.data.rank(pct=pct)

def time_average_old(self, constructor, pct):
self.data.rank(pct=pct) / len(self.data)

class stats_rank_pct_average_old(object):
goal_time = 0.2

def setup(self):
self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)])
self.s = Series(self.values)

def time_stats_rank_pct_average_old(self):
(self.s.rank() / len(self.s))

class Correlation(object):

class stats_rolling_mean(object):
goal_time = 0.2
params = ['spearman', 'kendall', 'pearson']
param_names = ['method']

def setup(self):
self.arr = np.random.randn(100000)
self.win = 100

def time_rolling_mean(self):
rolling_mean(self.arr, self.win)

def time_rolling_median(self):
rolling_median(self.arr, self.win)

def time_rolling_min(self):
rolling_min(self.arr, self.win)

def time_rolling_max(self):
rolling_max(self.arr, self.win)

def time_rolling_sum(self):
rolling_sum(self.arr, self.win)

def time_rolling_std(self):
rolling_std(self.arr, self.win)

def time_rolling_var(self):
rolling_var(self.arr, self.win)

def time_rolling_skew(self):
rolling_skew(self.arr, self.win)
def setup(self, method):
self.df = pd.DataFrame(np.random.randn(1000, 30))

def time_rolling_kurt(self):
rolling_kurt(self.arr, self.win)
def time_corr(self, method):
self.df.corr(method=method)