diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index dc1d6de73f8ae..16d9e7cd73cbb 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,77 +1,80 @@ -from .pandas_vb_common import * -import pandas as pd +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, to_numeric +from .pandas_vb_common import numeric_dtypes, lib, setup # noqa -class DtypeInfer(object): - goal_time = 0.2 +class NumericInferOps(object): # from GH 7332 + goal_time = 0.2 + params = numeric_dtypes + param_names = ['dtype'] + + def setup(self, dtype): + N = 5 * 10**5 + self.df = DataFrame({'A': np.arange(N).astype(dtype), + 'B': np.arange(N).astype(dtype)}) + + def time_add(self, dtype): + self.df['A'] + self.df['B'] + + def time_subtract(self, dtype): + self.df['A'] - self.df['B'] - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), - B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), - B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), - B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), - B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), - B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), - B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), - B=self.df_datetime64['B'])) + def time_multiply(self, dtype): + self.df['A'] * self.df['B'] - def time_int64(self): - (self.df_int64['A'] + self.df_int64['B']) + def time_divide(self, dtype): + self.df['A'] / self.df['B'] - def time_int32(self): - (self.df_int32['A'] + 
self.df_int32['B']) + def time_modulo(self, dtype): + self.df['A'] % self.df['B'] - def time_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) - def time_float64(self): - (self.df_float64['A'] + self.df_float64['B']) +class DateInferOps(object): + # from GH 7332 + goal_time = 0.2 + + def setup_cache(self): + N = 5 * 10**5 + df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) + df['timedelta'] = df['datetime64'] - df['datetime64'] + return df - def time_float32(self): - (self.df_float32['A'] + self.df_float32['B']) + def time_subtract_datetimes(self, df): + df['datetime64'] - df['datetime64'] - def time_datetime64(self): - (self.df_datetime64['A'] - self.df_datetime64['B']) + def time_timedelta_plus_datetime(self, df): + df['timedelta'] + df['datetime64'] - def time_timedelta64_1(self): - (self.df_timedelta64['A'] + self.df_timedelta64['B']) + def time_add_timedeltas(self, df): + df['timedelta'] + df['timedelta'] - def time_timedelta64_2(self): - (self.df_timedelta64['A'] + self.df_timedelta64['A']) +class ToNumeric(object): -class to_numeric(object): goal_time = 0.2 + params = ['ignore', 'coerce'] + param_names = ['errors'] - def setup(self): - self.n = 10000 - self.float = Series(np.random.randn(self.n * 100)) + def setup(self, errors): + N = 10000 + self.float = Series(np.random.randn(N)) self.numstr = self.float.astype('str') - self.str = Series(tm.makeStringIndex(self.n)) + self.str = Series(tm.makeStringIndex(N)) - def time_from_float(self): - pd.to_numeric(self.float) + def time_from_float(self, errors): + to_numeric(self.float, errors=errors) - def time_from_numeric_str(self): - pd.to_numeric(self.numstr) + def time_from_numeric_str(self, errors): + to_numeric(self.numstr, errors=errors) - def time_from_str_ignore(self): - pd.to_numeric(self.str, errors='ignore') + def time_from_str(self, errors): + to_numeric(self.str, errors=errors) - def time_from_str_coerce(self): - pd.to_numeric(self.str, errors='coerce') - -class 
to_numeric_downcast(object): +class ToNumericDowncast(object): param_names = ['dtype', 'downcast'] params = [['string-float', 'string-int', 'string-nint', 'datetime64', @@ -81,37 +84,30 @@ class to_numeric_downcast(object): N = 500000 N2 = int(N / 2) - data_dict = { - 'string-int': (['1'] * N2) + ([2] * N2), - 'string-nint': (['-1'] * N2) + ([2] * N2), - 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], - dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * N2) + ([2] * N2), - 'int-list': ([1] * N2) + ([2] * N2), - 'int32': np.repeat(np.int32(1), N) - } + data_dict = {'string-int': ['1'] * N2 + [2] * N2, + 'string-nint': ['-1'] * N2 + [2] * N2, + 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], + dtype='datetime64[D]'), N), + 'string-float': ['1.1'] * N2 + [2] * N2, + 'int-list': [1] * N2 + [2] * N2, + 'int32': np.repeat(np.int32(1), N)} def setup(self, dtype, downcast): self.data = self.data_dict[dtype] def time_downcast(self, dtype, downcast): - pd.to_numeric(self.data, downcast=downcast) + to_numeric(self.data, downcast=downcast) class MaybeConvertNumeric(object): - def setup(self): - n = 1000000 - arr = np.repeat([2**63], n) - arr = arr + np.arange(n).astype('uint64') - arr = np.array([arr[i] if i%2 == 0 else - str(arr[i]) for i in range(n)], - dtype=object) - - arr[-1] = -1 - self.data = arr - self.na_values = set() - - def time_convert(self): - lib.maybe_convert_numeric(self.data, self.na_values, - coerce_numeric=False) + def setup_cache(self): + N = 10**6 + arr = np.repeat([2**63], N) + np.arange(N).astype('uint64') + data = arr.astype(object) + data[1::2] = arr[1::2].astype(str) + data[-1] = -1 + return data + + def time_convert(self, data): + lib.maybe_convert_numeric(data, set(), coerce_numeric=False) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index b7040bfdb9397..4de87ddcb0683 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ 
b/asv_bench/benchmarks/pandas_vb_common.py @@ -14,6 +14,10 @@ except ImportError: pass +numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, + np.float64, np.int16, np.int8, np.uint16, np.uint8] +datetime_dtypes = [np.datetime64, np.timedelta64] + # This function just needs to be imported into each benchmark file in order to # sets up the random seed before each function. # http://asv.readthedocs.io/en/latest/writing_benchmarks.html @@ -39,7 +43,7 @@ def remove(self, f): def teardown(self, *args, **kwargs): self.remove(self.fname) -# try em until it works! +# Compatibility import for lib for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']: try: lib = import_module(imp)