Skip to content

Commit 2dcf668

Browse files
committed
CLN: ASV Algorithms
Try setup_cache; add base ASV class and test Hashing; rework algorithms benchmarks; improve algorithms benchmark; benchmarks working; cleanup.
1 parent e6eac0b commit 2dcf668

File tree

1 file changed

+114
-77
lines changed

1 file changed

+114
-77
lines changed

asv_bench/benchmarks/algorithms.py

+114-77
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from importlib import import_module
22

33
import numpy as np
4-
54
import pandas as pd
65
from pandas.util import testing as tm
6+
from pandas.core.algorithms import checked_add_with_arr
77

88
for imp in ['pandas.util', 'pandas.tools.hashing']:
99
try:
@@ -12,113 +12,150 @@
1212
except:
1313
pass
1414

15-
class Algorithms(object):
15+
16+
class Factorize(object):
17+
18+
goal_time = 0.2
19+
20+
def setup(self):
21+
N = 10**5
22+
np.random.seed(1234)
23+
self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
24+
self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
25+
self.string_idx = tm.makeStringIndex(N)
26+
27+
def time_factorize_int(self):
28+
self.int_idx.factorize()
29+
30+
def time_factorize_float(self):
31+
self.float_idx.factorize()
32+
33+
def time_factorize_string(self):
34+
self.string_idx.factorize()
35+
36+
37+
class Duplicated(object):
38+
1639
goal_time = 0.2
1740

1841
def setup(self):
19-
N = 100000
42+
N = 10**5
2043
np.random.seed(1234)
44+
self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
45+
self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
46+
47+
def time_duplicated_int(self):
48+
self.int_idx.duplicated()
49+
50+
def time_duplicated_float(self):
51+
self.float_idx.duplicated()
2152

22-
self.int_unique = pd.Int64Index(np.arange(N * 5))
53+
54+
class DuplicatedUniqueIndex(object):
55+
56+
goal_time = 0.2
57+
58+
def setup(self):
59+
N = 10**5
60+
self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
2361
# cache is_unique
24-
self.int_unique.is_unique
62+
self.idx_int_dup.is_unique
2563

26-
self.int = pd.Int64Index(np.arange(N).repeat(5))
27-
self.float = pd.Float64Index(np.random.randn(N).repeat(5))
64+
def time_duplicated_unique_int(self):
65+
self.idx_int_dup.duplicated()
2866

29-
# Convenience naming.
30-
self.checked_add = pd.core.algorithms.checked_add_with_arr
3167

32-
self.arr = np.arange(1000000)
33-
self.arrpos = np.arange(1000000)
34-
self.arrneg = np.arange(-1000000, 0)
35-
self.arrmixed = np.array([1, -1]).repeat(500000)
36-
self.strings = tm.makeStringIndex(100000)
68+
class Match(object):
3769

38-
self.arr_nan = np.random.choice([True, False], size=1000000)
39-
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
70+
goal_time = 0.2
4071

41-
# match
72+
def setup(self):
73+
np.random.seed(1234)
4274
self.uniques = tm.makeStringIndex(1000).values
4375
self.all = self.uniques.repeat(10)
4476

45-
def time_factorize_string(self):
46-
self.strings.factorize()
47-
48-
def time_factorize_int(self):
49-
self.int.factorize()
77+
def time_match_string(self):
78+
pd.match(self.all, self.uniques)
5079

51-
def time_factorize_float(self):
52-
self.int.factorize()
5380

54-
def time_duplicated_int_unique(self):
55-
self.int_unique.duplicated()
81+
class AddOverflowScalar(object):
5682

57-
def time_duplicated_int(self):
58-
self.int.duplicated()
83+
goal_time = 0.2
5984

60-
def time_duplicated_float(self):
61-
self.float.duplicated()
85+
params = [1, -1, 0]
6286

63-
def time_match_strings(self):
64-
pd.match(self.all, self.uniques)
87+
def setup(self, scalar):
88+
N = 10**6
89+
self.arr = np.arange(N)
6590

66-
def time_add_overflow_pos_scalar(self):
67-
self.checked_add(self.arr, 1)
91+
def time_add_overflow_scalar(self, scalar):
92+
checked_add_with_arr(self.arr, scalar)
6893

69-
def time_add_overflow_neg_scalar(self):
70-
self.checked_add(self.arr, -1)
7194

72-
def time_add_overflow_zero_scalar(self):
73-
self.checked_add(self.arr, 0)
95+
class AddOverflowArray(object):
7496

75-
def time_add_overflow_pos_arr(self):
76-
self.checked_add(self.arr, self.arrpos)
97+
goal_time = 0.2
7798

78-
def time_add_overflow_neg_arr(self):
79-
self.checked_add(self.arr, self.arrneg)
99+
def setup(self):
100+
np.random.seed(1234)
101+
N = 10**6
102+
self.arr = np.arange(N)
103+
self.arr_rev = np.arange(-N, 0)
104+
self.arr_mixed = np.array([1, -1]).repeat(N / 2)
105+
self.arr_nan_1 = np.random.choice([True, False], size=N)
106+
self.arr_nan_2 = np.random.choice([True, False], size=N)
80107

81-
def time_add_overflow_mixed_arr(self):
82-
self.checked_add(self.arr, self.arrmixed)
108+
def time_add_overflow_arr_rev(self):
109+
checked_add_with_arr(self.arr, self.arr_rev)
83110

84-
def time_add_overflow_first_arg_nan(self):
85-
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
111+
def time_add_overflow_arr_mask_nan(self):
112+
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
86113

87-
def time_add_overflow_second_arg_nan(self):
88-
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
114+
def time_add_overflow_b_mask_nan(self):
115+
checked_add_with_arr(self.arr, self.arr_mixed,
116+
b_mask=self.arr_nan_1)
89117

90118
def time_add_overflow_both_arg_nan(self):
91-
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
92-
b_mask=self.arrmixed_nan)
119+
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
120+
b_mask=self.arr_nan_2)
93121

94122

95123
class Hashing(object):
124+
96125
goal_time = 0.2
97126

98-
def setup(self):
99-
N = 100000
100-
101-
self.df = pd.DataFrame(
102-
{'A': pd.Series(tm.makeStringIndex(100).take(
103-
np.random.randint(0, 100, size=N))),
104-
'B': pd.Series(tm.makeStringIndex(10000).take(
105-
np.random.randint(0, 10000, size=N))),
106-
'D': np.random.randn(N),
107-
'E': np.arange(N),
108-
'F': pd.date_range('20110101', freq='s', periods=N),
109-
'G': pd.timedelta_range('1 day', freq='s', periods=N),
110-
})
111-
self.df['C'] = self.df['B'].astype('category')
112-
self.df.iloc[10:20] = np.nan
113-
114-
def time_frame(self):
115-
hashing.hash_pandas_object(self.df)
116-
117-
def time_series_int(self):
118-
hashing.hash_pandas_object(self.df.E)
119-
120-
def time_series_string(self):
121-
hashing.hash_pandas_object(self.df.B)
122-
123-
def time_series_categorical(self):
124-
hashing.hash_pandas_object(self.df.C)
127+
def setup_cache(self):
128+
np.random.seed(1234)
129+
N = 10**5
130+
131+
df = pd.DataFrame(
132+
{'strings': pd.Series(tm.makeStringIndex(10000).take(
133+
np.random.randint(0, 10000, size=N))),
134+
'floats': np.random.randn(N),
135+
'ints': np.arange(N),
136+
'dates': pd.date_range('20110101', freq='s', periods=N),
137+
'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
138+
df['categories'] = df['strings'].astype('category')
139+
df.iloc[10:20] = np.nan
140+
return df
141+
142+
def time_frame(self, df):
143+
hashing.hash_pandas_object(df)
144+
145+
def time_series_int(self, df):
146+
hashing.hash_pandas_object(df['ints'])
147+
148+
def time_series_string(self, df):
149+
hashing.hash_pandas_object(df['strings'])
150+
151+
def time_series_float(self, df):
152+
hashing.hash_pandas_object(df['floats'])
153+
154+
def time_series_categorical(self, df):
155+
hashing.hash_pandas_object(df['categories'])
156+
157+
def time_series_timedeltas(self, df):
158+
hashing.hash_pandas_object(df['timedeltas'])
159+
160+
def time_series_dates(self, df):
161+
hashing.hash_pandas_object(df['dates'])

0 commit comments

Comments (0)