Skip to content

CLN: ASV inference benchmark #18759

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 18, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 76 additions & 76 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,84 @@
from .pandas_vb_common import *
import pandas as pd
import numpy as np
import pandas.util.testing as tm
try:
import pandas._libs.lib as lib
except ImportError:
import pandas.lib as lib
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would keep this in pandas_vb_common; you can then just import lib from there (that way it can be reused in multiple files).

from pandas import DataFrame, Series, to_numeric

from .pandas_vb_common import numeric_dtypes, setup # noqa

class DtypeInfer(object):
goal_time = 0.2

class NumericInferOps(object):
# from GH 7332
goal_time = 0.2
params = numeric_dtypes
param_names = ['dtype']

def setup(self, dtype):
N = 5 * 10**5
self.df = DataFrame({'A': np.arange(N).astype(dtype),
'B': np.arange(N).astype(dtype)})

def time_add(self, dtype):
self.df['A'] + self.df['B']

def time_subtract(self, dtype):
self.df['A'] - self.df['B']

def setup(self):
self.N = 500000
self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'),
B=np.arange(self.N, dtype='int64')))
self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'),
B=np.arange(self.N, dtype='int32')))
self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'),
B=np.arange(self.N, dtype='uint32')))
self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'),
B=np.arange(self.N, dtype='float64')))
self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'),
B=np.arange(self.N, dtype='float32')))
self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'),
B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']),
B=self.df_datetime64['B']))
def time_multiply(self, dtype):
self.df['A'] * self.df['B']

def time_int64(self):
(self.df_int64['A'] + self.df_int64['B'])
def time_divide(self, dtype):
self.df['A'] / self.df['B']

def time_int32(self):
(self.df_int32['A'] + self.df_int32['B'])
def time_modulo(self, dtype):
self.df['A'] % self.df['B']

def time_uint32(self):
(self.df_uint32['A'] + self.df_uint32['B'])

def time_float64(self):
(self.df_float64['A'] + self.df_float64['B'])
class DateInferOps(object):
# from GH 7332
goal_time = 0.2

def setup_cache(self):
N = 5 * 10**5
df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')})
df['timedelta'] = df['datetime64'] - df['datetime64']
return df

def time_float32(self):
(self.df_float32['A'] + self.df_float32['B'])
def time_subtract_datetimes(self, df):
df['datetime64'] - df['datetime64']

def time_datetime64(self):
(self.df_datetime64['A'] - self.df_datetime64['B'])
def time_timedelta_plus_datetime(self, df):
df['timedelta'] + df['datetime64']

def time_timedelta64_1(self):
(self.df_timedelta64['A'] + self.df_timedelta64['B'])
def time_add_timedeltas(self, df):
df['timedelta'] + df['timedelta']

def time_timedelta64_2(self):
(self.df_timedelta64['A'] + self.df_timedelta64['A'])

class ToNumeric(object):

class to_numeric(object):
goal_time = 0.2
params = ['ignore', 'coerce']
param_names = ['errors']

def setup(self):
self.n = 10000
self.float = Series(np.random.randn(self.n * 100))
def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype('str')
self.str = Series(tm.makeStringIndex(self.n))
self.str = Series(tm.makeStringIndex(N))

def time_from_float(self):
pd.to_numeric(self.float)
def time_from_float(self, errors):
to_numeric(self.float, errors=errors)

def time_from_numeric_str(self):
pd.to_numeric(self.numstr)
def time_from_numeric_str(self, errors):
to_numeric(self.numstr, errors=errors)

def time_from_str_ignore(self):
pd.to_numeric(self.str, errors='ignore')
def time_from_str(self, errors):
to_numeric(self.str, errors=errors)

def time_from_str_coerce(self):
pd.to_numeric(self.str, errors='coerce')


class to_numeric_downcast(object):
class ToNumericDowncast(object):

param_names = ['dtype', 'downcast']
params = [['string-float', 'string-int', 'string-nint', 'datetime64',
Expand All @@ -81,37 +88,30 @@ class to_numeric_downcast(object):
N = 500000
N2 = int(N / 2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was there for a good reason: you can't multiply a list by a float (I find it a bit strange that this didn't fail for you).


data_dict = {
'string-int': (['1'] * N2) + ([2] * N2),
'string-nint': (['-1'] * N2) + ([2] * N2),
'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
dtype='datetime64[D]'), N),
'string-float': (['1.1'] * N2) + ([2] * N2),
'int-list': ([1] * N2) + ([2] * N2),
'int32': np.repeat(np.int32(1), N)
}
data_dict = {'string-int': ['1'] * N2 + [2] * N2,
'string-nint': ['-1'] * N2 + [2] * N2,
'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
dtype='datetime64[D]'), N),
'string-float': ['1.1'] * N2 + [2] * N2,
'int-list': [1] * N2 + [2] * N2,
'int32': np.repeat(np.int32(1), N)}

def setup(self, dtype, downcast):
self.data = self.data_dict[dtype]

def time_downcast(self, dtype, downcast):
pd.to_numeric(self.data, downcast=downcast)
to_numeric(self.data, downcast=downcast)


class MaybeConvertNumeric(object):

def setup(self):
n = 1000000
arr = np.repeat([2**63], n)
arr = arr + np.arange(n).astype('uint64')
arr = np.array([arr[i] if i%2 == 0 else
str(arr[i]) for i in range(n)],
dtype=object)

arr[-1] = -1
self.data = arr
self.na_values = set()

def time_convert(self):
lib.maybe_convert_numeric(self.data, self.na_values,
coerce_numeric=False)
def setup_cache(self):
N = 10**6
arr = np.repeat([2**63], N) + np.arange(N).astype('uint64')
data = arr.astype(object)
data[1::2] = arr[1::2].astype(str)
data[-1] = -1
return data

def time_convert(self, data):
lib.maybe_convert_numeric(data, set(), coerce_numeric=False)
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/pandas_vb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
except ImportError:
pass

numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
np.float64, np.int16, np.int8, np.uint16, np.uint8]
datetime_dtypes = [np.datetime64, np.timedelta64]

# This function just needs to be imported into each benchmark file in order to
# set up the random seed before each function.
# http://asv.readthedocs.io/en/latest/writing_benchmarks.html
Expand Down