Skip to content

CLN: ASV inference benchmark #18759

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 18, 2017
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 75 additions & 78 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,114 @@
from .pandas_vb_common import *
import pandas as pd
import numpy as np
import pandas.util.testing as tm
import pandas._libs.lib as lib
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is not backward compatible with versions before 0.20, but I think that's OK for now.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you recall what this import would be pre 0.20?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from pandas import lib (this is in fact what pandas/lib.py does now, but that will get blown away in 0.22).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can import lib from pandas_vb_common.py; the back-compat is handled there.

from pandas import DataFrame, Series, to_numeric

from .pandas_vb_common import setup # noqa

class DtypeInfer(object):
goal_time = 0.2

class NumericInferOps(object):
# from GH 7332
goal_time = 0.2
params = ['int64', 'int32', 'uint32', 'float32', 'float64']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add uint64. You can add int16, int8, uint16, and uint8 as well to cover the bases.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may want to define these numeric dtypes elsewhere and import them here (so we are consistent across the asv),

and probably for datetimelike as well.

param_names = ['dtype']

def setup(self, dtype):
N = 5 * 10**5
self.df = DataFrame({'A': np.arange(N).astype(dtype),
'B': np.arange(N).astype(dtype)})

def setup(self):
self.N = 500000
self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'),
B=np.arange(self.N, dtype='int64')))
self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'),
B=np.arange(self.N, dtype='int32')))
self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'),
B=np.arange(self.N, dtype='uint32')))
self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'),
B=np.arange(self.N, dtype='float64')))
self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'),
B=np.arange(self.N, dtype='float32')))
self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'),
B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms')))
self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']),
B=self.df_datetime64['B']))
def time_add(self, dtype):
self.df['A'] + self.df['B']

def time_int64(self):
(self.df_int64['A'] + self.df_int64['B'])
def time_subtract(self, dtype):
self.df['A'] - self.df['B']

def time_int32(self):
(self.df_int32['A'] + self.df_int32['B'])
def time_multiply(self, dtype):
self.df['A'] * self.df['B']

def time_uint32(self):
(self.df_uint32['A'] + self.df_uint32['B'])
def time_divide(self, dtype):
self.df['A'] / self.df['B']

def time_float64(self):
(self.df_float64['A'] + self.df_float64['B'])
def time_modulo(self, dtype):
self.df['A'] % self.df['B']

def time_float32(self):
(self.df_float32['A'] + self.df_float32['B'])

def time_datetime64(self):
(self.df_datetime64['A'] - self.df_datetime64['B'])
class DateInferOps(object):
# from GH 7332
goal_time = 0.2

def setup_cache(self):
N = 5 * 10**5
df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')})
df['timedelta'] = df['datetime64'] - df['datetime64']
return df

def time_timedelta64_1(self):
(self.df_timedelta64['A'] + self.df_timedelta64['B'])
def time_subtract_datetimes(self, df):
df['datetime64'] - df['datetime64']

def time_timedelta64_2(self):
(self.df_timedelta64['A'] + self.df_timedelta64['A'])
def time_timedelta_plus_datetime(self, df):
df['timedelta'] + df['datetime64']

def time_add_timedeltas(self, df):
df['timedelta'] + df['timedelta']


class ToNumeric(object):

class to_numeric(object):
goal_time = 0.2
params = ['ignore', 'coerce']
param_names = ['errors']

def setup(self):
self.n = 10000
self.float = Series(np.random.randn(self.n * 100))
def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype('str')
self.str = Series(tm.makeStringIndex(self.n))

def time_from_float(self):
pd.to_numeric(self.float)
self.str = Series(tm.makeStringIndex(N))

def time_from_numeric_str(self):
pd.to_numeric(self.numstr)
def time_from_float(self, errors):
to_numeric(self.float, errors=errors)

def time_from_str_ignore(self):
pd.to_numeric(self.str, errors='ignore')
def time_from_numeric_str(self, errors):
to_numeric(self.numstr, errors=errors)

def time_from_str_coerce(self):
pd.to_numeric(self.str, errors='coerce')
def time_from_str(self, errors):
to_numeric(self.str, errors=errors)


class to_numeric_downcast(object):
class ToNumericDowncast(object):

param_names = ['dtype', 'downcast']
params = [['string-float', 'string-int', 'string-nint', 'datetime64',
'int-list', 'int32'],
[None, 'integer', 'signed', 'unsigned', 'float']]

N = 500000
N2 = int(N / 2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was there for a good reason: you can't multiply a list by a float (I find it a bit strange that this didn't fail for you).


data_dict = {
'string-int': (['1'] * N2) + ([2] * N2),
'string-nint': (['-1'] * N2) + ([2] * N2),
'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
dtype='datetime64[D]'), N),
'string-float': (['1.1'] * N2) + ([2] * N2),
'int-list': ([1] * N2) + ([2] * N2),
'int32': np.repeat(np.int32(1), N)
}
N2 = N / 2

data_dict = {'string-int': ['1'] * N2 + [2] * N2,
'string-nint': ['-1'] * N2 + [2] * N2,
'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
dtype='datetime64[D]'), N),
'string-float': ['1.1'] * N2 + [2] * N2,
'int-list': [1] * N2 + [2] * N2,
'int32': np.repeat(np.int32(1), N)}

def setup(self, dtype, downcast):
self.data = self.data_dict[dtype]

def time_downcast(self, dtype, downcast):
pd.to_numeric(self.data, downcast=downcast)
to_numeric(self.data, downcast=downcast)


class MaybeConvertNumeric(object):

def setup(self):
n = 1000000
arr = np.repeat([2**63], n)
arr = arr + np.arange(n).astype('uint64')
arr = np.array([arr[i] if i%2 == 0 else
str(arr[i]) for i in range(n)],
dtype=object)

arr[-1] = -1
self.data = arr
self.na_values = set()

def time_convert(self):
lib.maybe_convert_numeric(self.data, self.na_values,
coerce_numeric=False)
def setup_cache(self):
N = 10**6
arr = np.repeat([2**63], N) + np.arange(N).astype('uint64')
data = arr.astype(object)
data[1::2] = arr[1::2].astype(str)
data[-1] = -1
return data

def time_convert(self, data):
lib.maybe_convert_numeric(data, set(), coerce_numeric=False)