Skip to content

Commit 1fab808

Browse files
mroeschke authored and jreback committed
CLN: ASV Algorithms benchmark (#18423)
1 parent 3d44221 commit 1fab808

File tree

2 files changed

+130
-82
lines changed

2 files changed

+130
-82
lines changed

asv_bench/benchmarks/algorithms.py

+86-82
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from importlib import import_module
22

33
import numpy as np
4-
54
import pandas as pd
65
from pandas.util import testing as tm
76

@@ -12,113 +11,118 @@
1211
except:
1312
pass
1413

15-
class Algorithms(object):
14+
15+
class Factorize(object):
16+
1617
goal_time = 0.2
1718

18-
def setup(self):
19-
N = 100000
20-
np.random.seed(1234)
19+
params = [True, False]
20+
param_names = ['sort']
2121

22-
self.int_unique = pd.Int64Index(np.arange(N * 5))
23-
# cache is_unique
24-
self.int_unique.is_unique
22+
def setup(self, sort):
23+
N = 10**5
24+
np.random.seed(1234)
25+
self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
26+
self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
27+
self.string_idx = tm.makeStringIndex(N)
2528

26-
self.int = pd.Int64Index(np.arange(N).repeat(5))
27-
self.float = pd.Float64Index(np.random.randn(N).repeat(5))
29+
def time_factorize_int(self, sort):
30+
self.int_idx.factorize(sort=sort)
2831

29-
# Convenience naming.
30-
self.checked_add = pd.core.algorithms.checked_add_with_arr
32+
def time_factorize_float(self, sort):
33+
self.float_idx.factorize(sort=sort)
3134

32-
self.arr = np.arange(1000000)
33-
self.arrpos = np.arange(1000000)
34-
self.arrneg = np.arange(-1000000, 0)
35-
self.arrmixed = np.array([1, -1]).repeat(500000)
36-
self.strings = tm.makeStringIndex(100000)
35+
def time_factorize_string(self, sort):
36+
self.string_idx.factorize(sort=sort)
3737

38-
self.arr_nan = np.random.choice([True, False], size=1000000)
39-
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
4038

41-
# match
42-
self.uniques = tm.makeStringIndex(1000).values
43-
self.all = self.uniques.repeat(10)
39+
class Duplicated(object):
4440

45-
def time_factorize_string(self):
46-
self.strings.factorize()
41+
goal_time = 0.2
4742

48-
def time_factorize_int(self):
49-
self.int.factorize()
43+
params = ['first', 'last', False]
44+
param_names = ['keep']
5045

51-
def time_factorize_float(self):
52-
self.int.factorize()
46+
def setup(self, keep):
47+
N = 10**5
48+
np.random.seed(1234)
49+
self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
50+
self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
51+
self.string_idx = tm.makeStringIndex(N)
5352

54-
def time_duplicated_int_unique(self):
55-
self.int_unique.duplicated()
53+
def time_duplicated_int(self, keep):
54+
self.int_idx.duplicated(keep=keep)
5655

57-
def time_duplicated_int(self):
58-
self.int.duplicated()
56+
def time_duplicated_float(self, keep):
57+
self.float_idx.duplicated(keep=keep)
5958

60-
def time_duplicated_float(self):
61-
self.float.duplicated()
59+
def time_duplicated_string(self, keep):
60+
self.string_idx.duplicated(keep=keep)
6261

63-
def time_match_strings(self):
64-
pd.match(self.all, self.uniques)
6562

66-
def time_add_overflow_pos_scalar(self):
67-
self.checked_add(self.arr, 1)
63+
class DuplicatedUniqueIndex(object):
6864

69-
def time_add_overflow_neg_scalar(self):
70-
self.checked_add(self.arr, -1)
65+
goal_time = 0.2
7166

72-
def time_add_overflow_zero_scalar(self):
73-
self.checked_add(self.arr, 0)
67+
def setup(self):
68+
N = 10**5
69+
self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
70+
# cache is_unique
71+
self.idx_int_dup.is_unique
7472

75-
def time_add_overflow_pos_arr(self):
76-
self.checked_add(self.arr, self.arrpos)
73+
def time_duplicated_unique_int(self):
74+
self.idx_int_dup.duplicated()
7775

78-
def time_add_overflow_neg_arr(self):
79-
self.checked_add(self.arr, self.arrneg)
8076

81-
def time_add_overflow_mixed_arr(self):
82-
self.checked_add(self.arr, self.arrmixed)
77+
class Match(object):
8378

84-
def time_add_overflow_first_arg_nan(self):
85-
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
79+
goal_time = 0.2
8680

87-
def time_add_overflow_second_arg_nan(self):
88-
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
81+
def setup(self):
82+
np.random.seed(1234)
83+
self.uniques = tm.makeStringIndex(1000).values
84+
self.all = self.uniques.repeat(10)
8985

90-
def time_add_overflow_both_arg_nan(self):
91-
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
92-
b_mask=self.arrmixed_nan)
86+
def time_match_string(self):
87+
pd.match(self.all, self.uniques)
9388

9489

9590
class Hashing(object):
91+
9692
goal_time = 0.2
9793

98-
def setup(self):
99-
N = 100000
100-
101-
self.df = pd.DataFrame(
102-
{'A': pd.Series(tm.makeStringIndex(100).take(
103-
np.random.randint(0, 100, size=N))),
104-
'B': pd.Series(tm.makeStringIndex(10000).take(
105-
np.random.randint(0, 10000, size=N))),
106-
'D': np.random.randn(N),
107-
'E': np.arange(N),
108-
'F': pd.date_range('20110101', freq='s', periods=N),
109-
'G': pd.timedelta_range('1 day', freq='s', periods=N),
110-
})
111-
self.df['C'] = self.df['B'].astype('category')
112-
self.df.iloc[10:20] = np.nan
113-
114-
def time_frame(self):
115-
hashing.hash_pandas_object(self.df)
116-
117-
def time_series_int(self):
118-
hashing.hash_pandas_object(self.df.E)
119-
120-
def time_series_string(self):
121-
hashing.hash_pandas_object(self.df.B)
122-
123-
def time_series_categorical(self):
124-
hashing.hash_pandas_object(self.df.C)
94+
def setup_cache(self):
95+
np.random.seed(1234)
96+
N = 10**5
97+
98+
df = pd.DataFrame(
99+
{'strings': pd.Series(tm.makeStringIndex(10000).take(
100+
np.random.randint(0, 10000, size=N))),
101+
'floats': np.random.randn(N),
102+
'ints': np.arange(N),
103+
'dates': pd.date_range('20110101', freq='s', periods=N),
104+
'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
105+
df['categories'] = df['strings'].astype('category')
106+
df.iloc[10:20] = np.nan
107+
return df
108+
109+
def time_frame(self, df):
110+
hashing.hash_pandas_object(df)
111+
112+
def time_series_int(self, df):
113+
hashing.hash_pandas_object(df['ints'])
114+
115+
def time_series_string(self, df):
116+
hashing.hash_pandas_object(df['strings'])
117+
118+
def time_series_float(self, df):
119+
hashing.hash_pandas_object(df['floats'])
120+
121+
def time_series_categorical(self, df):
122+
hashing.hash_pandas_object(df['categories'])
123+
124+
def time_series_timedeltas(self, df):
125+
hashing.hash_pandas_object(df['timedeltas'])
126+
127+
def time_series_dates(self, df):
128+
hashing.hash_pandas_object(df['dates'])

asv_bench/benchmarks/binary_ops.py

+44
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22
from pandas import DataFrame, Series, date_range
3+
from pandas.core.algorithms import checked_add_with_arr
34
try:
45
import pandas.core.computation.expressions as expr
56
except ImportError:
@@ -108,3 +109,46 @@ def time_timestamp_ops_diff(self, tz):
108109

109110
def time_timestamp_ops_diff_with_shift(self, tz):
110111
self.s - self.s.shift()
112+
113+
114+
class AddOverflowScalar(object):
115+
116+
goal_time = 0.2
117+
118+
params = [1, -1, 0]
119+
param_names = ['scalar']
120+
121+
def setup(self, scalar):
122+
N = 10**6
123+
self.arr = np.arange(N)
124+
125+
def time_add_overflow_scalar(self, scalar):
126+
checked_add_with_arr(self.arr, scalar)
127+
128+
129+
class AddOverflowArray(object):
    """Benchmarks for ``checked_add_with_arr`` with two array operands.

    Every benchmark passes ``self.arr`` as the first operand and a second
    array of the same length, optionally together with boolean arrays
    supplied through the ``arr_mask`` / ``b_mask`` keywords.
    """

    # NOTE(review): presumably asv's per-benchmark target time in
    # seconds -- confirm against the asv version in use.
    goal_time = 0.2

    def setup(self):
        """Create the operand arrays and the random boolean masks."""
        # Fixed seed so the random masks are identical on every run.
        np.random.seed(1234)
        N = 10**6
        self.arr = np.arange(N)
        self.arr_rev = np.arange(-N, 0)
        # Floor division keeps the repeat count an integer: ``N / 2`` is a
        # float under Python 3 true division and ``repeat`` refuses to
        # cast it, which made ``setup`` raise before any timing ran.
        self.arr_mixed = np.array([1, -1]).repeat(N // 2)
        self.arr_nan_1 = np.random.choice([True, False], size=N)
        self.arr_nan_2 = np.random.choice([True, False], size=N)

    def time_add_overflow_arr_rev(self):
        """Time adding ``arr`` to the all-negative ``arr_rev``."""
        checked_add_with_arr(self.arr, self.arr_rev)

    def time_add_overflow_arr_mask_nan(self):
        """Time adding ``arr`` and ``arr_mixed`` with only ``arr_mask``."""
        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)

    def time_add_overflow_b_mask_nan(self):
        """Time adding ``arr`` and ``arr_mixed`` with only ``b_mask``."""
        checked_add_with_arr(self.arr, self.arr_mixed,
                             b_mask=self.arr_nan_1)

    def time_add_overflow_both_arg_nan(self):
        """Time adding ``arr`` and ``arr_mixed`` with both masks given."""
        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
                             b_mask=self.arr_nan_2)

0 commit comments

Comments
 (0)