Skip to content

Commit 921b897

Browse files
committed
Reorganize factorization benchmarks to speed up initialization
1 parent 996867b commit 921b897

File tree

1 file changed

+38
-16
lines changed

1 file changed

+38
-16
lines changed

vb_suite/factorize.py

+38-16
Original file line numberDiff line numberDiff line change
@@ -5,36 +5,58 @@
55

66
# GH 8524
77

8-
setup = """from pandas_vb_common import *
8+
common_setup = """from pandas_vb_common import *
99
from pandas import factorize
1010
SIZE = 1000000
11-
12-
int_values_uniq = np.arange(SIZE) * 100
13-
str_values_uniq = tm.makeStringIndex(SIZE)
14-
float_values_uniq = np.linspace(0., 1., num=SIZE) * 100
15-
1611
indices = np.random.randint(100, size=SIZE)
17-
int_values_dup = int_values_uniq.take(indices)
18-
str_values_dup = str_values_uniq.take(indices)
19-
shortstr_values_dup = Index(np.take(['AA', 'BB', 'CC', 'DD'],
20-
np.random.randint(4, size=SIZE)))
21-
float_values_dup = float_values_uniq.take(indices)
2212
"""
2313

2414

15+
# --- Integer array factorization
16+
setup = common_setup + """
17+
int_values_uniq = np.arange(SIZE) * 100
18+
"""
2519
factorize_int_uniq = Benchmark("factorize(int_values_uniq)", setup,
2620
start_date=START_DATE)
21+
setup = common_setup + """
22+
int_values_dup = (np.arange(SIZE) * 100).take(indices)
23+
"""
2724
factorize_int_dup = Benchmark("factorize(int_values_dup)", setup,
2825
start_date=START_DATE)
2926

30-
factorize_str_uniq = Benchmark("factorize(str_values_uniq)", setup,
27+
28+
# --- String array factorization
29+
setup = common_setup + """
30+
str_values_uniq = tm.makeStringIndex(SIZE)
31+
"""
32+
factorize_str_uniq = Benchmark("factorize(str_values_uniq)", setup=setup,
3133
start_date=START_DATE)
32-
factorize_str_dup = Benchmark("factorize(str_values_dup)", setup,
34+
setup = common_setup + """
35+
str_values_dup = tm.makeStringIndex(SIZE).take(indices)
36+
"""
37+
factorize_str_dup = Benchmark("factorize(str_values_dup)", setup=setup,
3338
start_date=START_DATE)
34-
factorize_shortstr_dup = Benchmark("factorize(shortstr_values_dup)", setup,
35-
start_date=START_DATE)
39+
setup = common_setup + """
40+
shortstr_4_dup = Index(np.take(['AA', 'BB', 'CC', 'DD'],
41+
np.random.randint(4, size=SIZE)))
42+
"""
43+
factorize_shortstr_4_dup = Benchmark("factorize(shortstr_4_dup)",
44+
setup=setup, start_date=START_DATE)
45+
setup = common_setup + """
46+
shortstr_many_dup = tm.rands_array(2, SIZE)
47+
"""
48+
factorize_shortstr_many_dup = Benchmark("factorize(shortstr_many_dup)",
49+
setup=setup, start_date=START_DATE)
3650

37-
factorize_float_uniq = Benchmark("factorize(float_values_uniq)", setup,
51+
52+
# --- Float array factorization
53+
setup = common_setup + """
54+
float_values_uniq = np.linspace(0., 1., num=SIZE) * 100
55+
"""
56+
factorize_float_uniq = Benchmark("factorize(float_values_uniq)", setup=setup,
3857
start_date=START_DATE)
58+
setup = common_setup + """
59+
float_values_dup = (np.linspace(0., 1., num=SIZE) * 100).take(indices)
60+
"""
3961
factorize_float_dup = Benchmark("factorize(float_values_dup)", setup,
4062
start_date=START_DATE)

0 commit comments

Comments
 (0)