From d9fc6f483a6cf3cbc63626c87e64b73e9fc25a83 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 12 Dec 2017 22:21:55 -0800 Subject: [PATCH 1/3] CLN: ASV inference benchmark --- asv_bench/benchmarks/inference.py | 153 +++++++++++++++--------------- 1 file changed, 75 insertions(+), 78 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index dc1d6de73f8ae..363b38229197d 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,77 +1,81 @@ -from .pandas_vb_common import * -import pandas as pd +import numpy as np +import pandas.util.testing as tm +import pandas._libs.lib as lib +from pandas import DataFrame, Series, to_numeric +from .pandas_vb_common import setup # noqa -class DtypeInfer(object): - goal_time = 0.2 +class NumericInferOps(object): # from GH 7332 + goal_time = 0.2 + params = ['int64', 'int32', 'uint32', 'float32', 'float64'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 5 * 10**5 + self.df = DataFrame({'A': np.arange(N).astype(dtype), + 'B': np.arange(N).astype(dtype)}) - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), - B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), - B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), - B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), - B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), - B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), - B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), - B=self.df_datetime64['B'])) + def time_add(self, 
dtype): + self.df['A'] + self.df['B'] - def time_int64(self): - (self.df_int64['A'] + self.df_int64['B']) + def time_subtract(self, dtype): + self.df['A'] - self.df['B'] - def time_int32(self): - (self.df_int32['A'] + self.df_int32['B']) + def time_multiply(self, dtype): + self.df['A'] * self.df['B'] - def time_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) + def time_divide(self, dtype): + self.df['A'] / self.df['B'] - def time_float64(self): - (self.df_float64['A'] + self.df_float64['B']) + def time_modulo(self, dtype): + self.df['A'] % self.df['B'] - def time_float32(self): - (self.df_float32['A'] + self.df_float32['B']) - def time_datetime64(self): - (self.df_datetime64['A'] - self.df_datetime64['B']) +class DateInferOps(object): + # from GH 7332 + goal_time = 0.2 + + def setup_cache(self): + N = 5 * 10**5 + df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) + df['timedelta'] = df['datetime64'] - df['datetime64'] + return df - def time_timedelta64_1(self): - (self.df_timedelta64['A'] + self.df_timedelta64['B']) + def time_subtract_datetimes(self, df): + df['datetime64'] - df['datetime64'] - def time_timedelta64_2(self): - (self.df_timedelta64['A'] + self.df_timedelta64['A']) + def time_timedelta_plus_datetime(self, df): + df['timedelta'] + df['datetime64'] + def time_add_timedeltas(self, df): + df['timedelta'] + df['timedelta'] + + +class ToNumeric(object): -class to_numeric(object): goal_time = 0.2 + params = ['ignore', 'coerce'] + param_names = ['errors'] - def setup(self): - self.n = 10000 - self.float = Series(np.random.randn(self.n * 100)) + def setup(self, errors): + N = 10000 + self.float = Series(np.random.randn(N)) self.numstr = self.float.astype('str') - self.str = Series(tm.makeStringIndex(self.n)) - - def time_from_float(self): - pd.to_numeric(self.float) + self.str = Series(tm.makeStringIndex(N)) - def time_from_numeric_str(self): - pd.to_numeric(self.numstr) + def time_from_float(self, errors): + 
to_numeric(self.float, errors=errors) - def time_from_str_ignore(self): - pd.to_numeric(self.str, errors='ignore') + def time_from_numeric_str(self, errors): + to_numeric(self.numstr, errors=errors) - def time_from_str_coerce(self): - pd.to_numeric(self.str, errors='coerce') + def time_from_str(self, errors): + to_numeric(self.str, errors=errors) -class to_numeric_downcast(object): +class ToNumericDowncast(object): param_names = ['dtype', 'downcast'] params = [['string-float', 'string-int', 'string-nint', 'datetime64', @@ -79,39 +83,32 @@ class to_numeric_downcast(object): [None, 'integer', 'signed', 'unsigned', 'float']] N = 500000 - N2 = int(N / 2) - - data_dict = { - 'string-int': (['1'] * N2) + ([2] * N2), - 'string-nint': (['-1'] * N2) + ([2] * N2), - 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], - dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * N2) + ([2] * N2), - 'int-list': ([1] * N2) + ([2] * N2), - 'int32': np.repeat(np.int32(1), N) - } + N2 = N / 2 + + data_dict = {'string-int': ['1'] * N2 + [2] * N2, + 'string-nint': ['-1'] * N2 + [2] * N2, + 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], + dtype='datetime64[D]'), N), + 'string-float': ['1.1'] * N2 + [2] * N2, + 'int-list': [1] * N2 + [2] * N2, + 'int32': np.repeat(np.int32(1), N)} def setup(self, dtype, downcast): self.data = self.data_dict[dtype] def time_downcast(self, dtype, downcast): - pd.to_numeric(self.data, downcast=downcast) + to_numeric(self.data, downcast=downcast) class MaybeConvertNumeric(object): - def setup(self): - n = 1000000 - arr = np.repeat([2**63], n) - arr = arr + np.arange(n).astype('uint64') - arr = np.array([arr[i] if i%2 == 0 else - str(arr[i]) for i in range(n)], - dtype=object) - - arr[-1] = -1 - self.data = arr - self.na_values = set() - - def time_convert(self): - lib.maybe_convert_numeric(self.data, self.na_values, - coerce_numeric=False) + def setup_cache(self): + N = 10**6 + arr = np.repeat([2**63], N) + 
np.arange(N).astype('uint64') + data = arr.astype(object) + data[1::2] = arr[1::2].astype(str) + data[-1] = -1 + return data + + def time_convert(self, data): + lib.maybe_convert_numeric(data, set(), coerce_numeric=False) From f33fbca76034e991e5835ea93aada93927fb70fb Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 14 Dec 2017 21:06:48 -0800 Subject: [PATCH 2/3] Address comments --- asv_bench/benchmarks/inference.py | 11 +++++++---- asv_bench/benchmarks/pandas_vb_common.py | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 363b38229197d..5576d6cd45f6f 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,15 +1,18 @@ import numpy as np import pandas.util.testing as tm -import pandas._libs.lib as lib +try: + import pandas._libs.lib as lib +except ImportError: + import pandas.lib as lib from pandas import DataFrame, Series, to_numeric -from .pandas_vb_common import setup # noqa +from .pandas_vb_common import numeric_dtypes, setup # noqa class NumericInferOps(object): # from GH 7332 goal_time = 0.2 - params = ['int64', 'int32', 'uint32', 'float32', 'float64'] + params = numeric_dtypes param_names = ['dtype'] def setup(self, dtype): @@ -83,7 +86,7 @@ class ToNumericDowncast(object): [None, 'integer', 'signed', 'unsigned', 'float']] N = 500000 - N2 = N / 2 + N2 = int(N / 2) data_dict = {'string-int': ['1'] * N2 + [2] * N2, 'string-nint': ['-1'] * N2 + [2] * N2, diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index b7040bfdb9397..884ce442a3a1c 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -14,6 +14,10 @@ except ImportError: pass +numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, + np.float64, np.int16, np.int8, np.uint16, np.uint8] +datetime_dtypes = [np.datetime64, np.timedelta64] + # This function just 
needs to be imported into each benchmark file in order to # sets up the random seed before each function. # http://asv.readthedocs.io/en/latest/writing_benchmarks.html From 8c2f3f63d3d0399ab184525e1f3820bc47c9df81 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 16 Dec 2017 11:35:46 -0800 Subject: [PATCH 3/3] import lib from common --- asv_bench/benchmarks/inference.py | 6 +----- asv_bench/benchmarks/pandas_vb_common.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 5576d6cd45f6f..16d9e7cd73cbb 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,12 +1,8 @@ import numpy as np import pandas.util.testing as tm -try: - import pandas._libs.lib as lib -except ImportError: - import pandas.lib as lib from pandas import DataFrame, Series, to_numeric -from .pandas_vb_common import numeric_dtypes, setup # noqa +from .pandas_vb_common import numeric_dtypes, lib, setup # noqa class NumericInferOps(object): diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 884ce442a3a1c..4de87ddcb0683 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -43,7 +43,7 @@ def remove(self, f): def teardown(self, *args, **kwargs): self.remove(self.fname) -# try em until it works! +# Compatibility import for lib for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']: try: lib = import_module(imp)