Reviewer-annotated patch: clean up the asv string benchmarks.

Everything above the first "diff --git" header is free-text commentary.

Summary of changes
  * asv_bench/benchmarks/pandas_vb_common.py
      - Drops "from pandas import *" and the pd / randn / randint / tm /
        random / threading imports; numpy is imported explicitly as np.
      - Replaces the "from pandas.compat import range" fallback with a
        Panel import that falls back to WidePanel.
      - Moves the lib compatibility loop from the bottom of the module to
        the top and removes 'pandas_tseries' from the candidate list.
      - Removes the trailing "Panel = Panel" shim and the MultiIndex
        fallback import.
      - Moves the explanatory comment for setup() into the function body.
  * asv_bench/benchmarks/strings.py
      - Replaces the star import with explicit numpy / Series / tm imports.
      - Renames StringMethods -> Methods, StringEncode -> Encode and
        StringSlice -> Slice, and builds the data with tm.makeStringIndex
        instead of the make_series helper.
      - Splits repeat / contains / split / get_dummies into the
        parametrized classes Repeat, Contains, Split and Dummies.
  * ci/lint.sh
      - Changes the flake8 --exclude pattern for asv_bench/benchmarks/.

Review notes
  * The lib compatibility loop catches (ImportError, TypeError, ValueError)
    instead of a bare "except:". The import stays best-effort, but
    KeyboardInterrupt and SystemExit are no longer swallowed.
    NOTE(review): TypeError / ValueError are included on the assumption
    that a mismatched extension build can raise them at import time;
    confirm, or narrow to ImportError.
  * NOTE(review): the new exclude pattern asv_bench/benchmarks/*.py names
    every .py file in that directory; confirm flake8 still checks the
    benchmark files this patch cleans up.
  * NOTE(review): the "index" hashes are those recorded for the original
    patch; regenerate them if the post-image hash matters to your tooling.

diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
index 7b4fec0090701..c0d24afae4219 100644
--- a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -1,27 +1,29 @@
 import os
-from pandas import *
-import pandas as pd
-from numpy.random import randn
-from numpy.random import randint
-import pandas.util.testing as tm
-import random
-import numpy as np
-import threading
 from importlib import import_module
 
+import numpy as np
 try:
-    from pandas.compat import range
+    from pandas import Panel
 except ImportError:
-    pass
+    from pandas import WidePanel as Panel  # noqa
+
+# Compatibility import for lib
+for imp in ['pandas._libs.lib', 'pandas.lib']:
+    try:
+        lib = import_module(imp)
+        break
+    except (ImportError, TypeError, ValueError):
+        pass
 
 numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
                   np.float64, np.int16, np.int8, np.uint16, np.uint8]
 datetime_dtypes = [np.datetime64, np.timedelta64]
 
-# This function just needs to be imported into each benchmark file in order to
-# sets up the random seed before each function.
-# http://asv.readthedocs.io/en/latest/writing_benchmarks.html
+
 def setup(*args, **kwargs):
+    # This function just needs to be imported into each benchmark file to
+    # set up the random seed before each function.
+    # http://asv.readthedocs.io/en/latest/writing_benchmarks.html
     np.random.seed(1234)
 
 
@@ -42,22 +44,3 @@ def remove(self, f):
 
     def teardown(self, *args, **kwargs):
         self.remove(self.fname)
-
-# Compatibility import for lib
-for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']:
-    try:
-        lib = import_module(imp)
-        break
-    except:
-        pass
-
-try:
-    Panel = Panel
-except Exception:
-    Panel = WidePanel
-
-# didn't add to namespace until later
-try:
-    from pandas.core.index import MultiIndex
-except ImportError:
-    pass
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 948d4b92a5a57..4435327e1eb38 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -1,119 +1,144 @@
-from .pandas_vb_common import *
-import string
-import itertools as IT
-import pandas.util.testing as testing
+import numpy as np
+from pandas import Series
+import pandas.util.testing as tm
 
 
-class StringMethods(object):
-    goal_time = 0.2
+class Methods(object):
 
-    def make_series(self, letters, strlen, size):
-        return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))])
+    goal_time = 0.2
 
     def setup(self):
-        self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000)
-        self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000)
-        self.s = self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|')
+        self.s = Series(tm.makeStringIndex(10**5))
 
     def time_cat(self):
-        self.many.str.cat(sep=',')
+        self.s.str.cat(sep=',')
 
     def time_center(self):
-        self.many.str.center(100)
-
-    def time_contains_few(self):
-        self.few.str.contains('matchthis')
-
-    def time_contains_few_noregex(self):
-        self.few.str.contains('matchthis', regex=False)
-
-    def time_contains_many(self):
-        self.many.str.contains('matchthis')
-
-    def time_contains_many_noregex(self):
-        self.many.str.contains('matchthis', regex=False)
+        self.s.str.center(100)
 
     def time_count(self):
-        self.many.str.count('matchthis')
+        self.s.str.count('A')
 
     def time_endswith(self):
-        self.many.str.endswith('matchthis')
+        self.s.str.endswith('A')
 
     def time_extract(self):
-        self.many.str.extract('(\\w*)matchthis(\\w*)')
+        self.s.str.extract('(\\w*)A(\\w*)')
 
     def time_findall(self):
-        self.many.str.findall('[A-Z]+')
+        self.s.str.findall('[A-Z]+')
 
     def time_get(self):
-        self.many.str.get(0)
-
-    def time_join_split(self):
-        self.many.str.join('--').str.split('--')
-
-    def time_join_split_expand(self):
-        self.many.str.join('--').str.split('--', expand=True)
+        self.s.str.get(0)
 
     def time_len(self):
-        self.many.str.len()
+        self.s.str.len()
 
     def time_match(self):
-        self.many.str.match('mat..this')
+        self.s.str.match('A')
 
     def time_pad(self):
-        self.many.str.pad(100, side='both')
-
-    def time_repeat(self):
-        self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many))))
+        self.s.str.pad(100, side='both')
 
     def time_replace(self):
-        self.many.str.replace('(matchthis)', '\x01\x01')
+        self.s.str.replace('A', '\x01\x01')
 
     def time_slice(self):
-        self.many.str.slice(5, 15, 2)
+        self.s.str.slice(5, 15, 2)
 
     def time_startswith(self):
-        self.many.str.startswith('matchthis')
+        self.s.str.startswith('A')
 
     def time_strip(self):
-        self.many.str.strip('matchthis')
+        self.s.str.strip('A')
 
     def time_rstrip(self):
-        self.many.str.rstrip('matchthis')
+        self.s.str.rstrip('A')
 
     def time_lstrip(self):
-        self.many.str.lstrip('matchthis')
+        self.s.str.lstrip('A')
 
     def time_title(self):
-        self.many.str.title()
+        self.s.str.title()
 
     def time_upper(self):
-        self.many.str.upper()
+        self.s.str.upper()
 
     def time_lower(self):
-        self.many.str.lower()
+        self.s.str.lower()
+
+
+class Repeat(object):
+
+    goal_time = 0.2
+    params = ['int', 'array']
+    param_names = ['repeats']
+
+    def setup(self, repeats):
+        N = 10**5
+        self.s = Series(tm.makeStringIndex(N))
+        repeat = {'int': 1, 'array': np.random.randint(1, 3, N)}
+        self.repeat = repeat[repeats]
+
+    def time_repeat(self, repeats):
+        self.s.str.repeat(self.repeat)
+
+
+class Contains(object):
+
+    goal_time = 0.2
+    params = [True, False]
+    param_names = ['regex']
+
+    def setup(self, regex):
+        self.s = Series(tm.makeStringIndex(10**5))
+
+    def time_contains(self, regex):
+        self.s.str.contains('A', regex=regex)
+
+
+class Split(object):
+
+    goal_time = 0.2
+    params = [True, False]
+    param_names = ['expand']
+
+    def setup(self, expand):
+        self.s = Series(tm.makeStringIndex(10**5)).str.join('--')
+
+    def time_split(self, expand):
+        self.s.str.split('--', expand=expand)
+
+
+class Dummies(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = Series(tm.makeStringIndex(10**5)).str.join('|')
 
     def time_get_dummies(self):
         self.s.str.get_dummies('|')
 
 
-class StringEncode(object):
+class Encode(object):
+
     goal_time = 0.2
 
     def setup(self):
-        self.ser = Series(testing.makeUnicodeIndex())
+        self.ser = Series(tm.makeUnicodeIndex())
 
     def time_encode_decode(self):
         self.ser.str.encode('utf-8').str.decode('utf-8')
 
 
-class StringSlice(object):
+class Slice(object):
 
     goal_time = 0.2
 
     def setup(self):
         self.s = Series(['abcdefg', np.nan] * 500000)
 
-    def time_series_string_vector_slice(self):
+    def time_vector_slice(self):
         # GH 2602
         self.s.str[:5]
diff --git a/ci/lint.sh b/ci/lint.sh
index b4eafcaf28e39..2031eaa8a1d5d 100755
--- a/ci/lint.sh
+++ b/ci/lint.sh
@@ -24,7 +24,7 @@ if [ "$LINT" ]; then
     echo "Linting setup.py DONE"
 
     echo "Linting asv_bench/benchmarks/"
-    flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/[ps]*.py --ignore=F811
+    flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/*.py --ignore=F811
     if [ $? -ne "0" ]; then
         RET=1
     fi