from importlib import import_module

import numpy as np
-
import pandas as pd
from pandas.util import testing as tm
+from pandas.core.algorithms import checked_add_with_arr

for imp in ['pandas.util', 'pandas.tools.hashing']:
    try:
        hashing = import_module(imp)
        break
    except:
        pass

-class Algorithms(object):
+
+class Factorize(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 10**5
+        np.random.seed(1234)
+        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
+        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
+        self.string_idx = tm.makeStringIndex(N)
+
+    def time_factorize_int(self):
+        self.int_idx.factorize()
+
+    def time_factorize_float(self):
+        self.float_idx.factorize()
+
+    def time_factorize_string(self):
+        self.string_idx.factorize()
+
+
+class Duplicated(object):
+
    goal_time = 0.2

    def setup(self):
-        N = 100000
+        N = 10**5
        np.random.seed(1234)
+        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
+        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
+
+    def time_duplicated_int(self):
+        self.int_idx.duplicated()
+
+    def time_duplicated_float(self):
+        self.float_idx.duplicated()

-        self.int_unique = pd.Int64Index(np.arange(N * 5))
+
+class DuplicatedUniqueIndex(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 10**5
+        self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
        # cache is_unique
-        self.int_unique.is_unique
+        self.idx_int_dup.is_unique

-        self.int = pd.Int64Index(np.arange(N).repeat(5))
-        self.float = pd.Float64Index(np.random.randn(N).repeat(5))
+    def time_duplicated_unique_int(self):
+        self.idx_int_dup.duplicated()

-        # Convenience naming.
-        self.checked_add = pd.core.algorithms.checked_add_with_arr

-        self.arr = np.arange(1000000)
-        self.arrpos = np.arange(1000000)
-        self.arrneg = np.arange(-1000000, 0)
-        self.arrmixed = np.array([1, -1]).repeat(500000)
-        self.strings = tm.makeStringIndex(100000)
+class Match(object):

-        self.arr_nan = np.random.choice([True, False], size=1000000)
-        self.arrmixed_nan = np.random.choice([True, False], size=1000000)
+    goal_time = 0.2

-        # match
+    def setup(self):
+        np.random.seed(1234)
        self.uniques = tm.makeStringIndex(1000).values
        self.all = self.uniques.repeat(10)

-    def time_factorize_string(self):
-        self.strings.factorize()
-
-    def time_factorize_int(self):
-        self.int.factorize()
+    def time_match_string(self):
+        pd.match(self.all, self.uniques)

-    def time_factorize_float(self):
-        self.int.factorize()

-    def time_duplicated_int_unique(self):
-        self.int_unique.duplicated()
+class AddOverflowScalar(object):

-    def time_duplicated_int(self):
-        self.int.duplicated()
+    goal_time = 0.2

-    def time_duplicated_float(self):
-        self.float.duplicated()
+    params = [1, -1, 0]

-    def time_match_strings(self):
-        pd.match(self.all, self.uniques)
+    def setup(self, scalar):
+        N = 10**6
+        self.arr = np.arange(N)

-    def time_add_overflow_pos_scalar(self):
-        self.checked_add(self.arr, 1)
+    def time_add_overflow_scalar(self, scalar):
+        checked_add_with_arr(self.arr, scalar)

-    def time_add_overflow_neg_scalar(self):
-        self.checked_add(self.arr, -1)

-    def time_add_overflow_zero_scalar(self):
-        self.checked_add(self.arr, 0)
+class AddOverflowArray(object):

-    def time_add_overflow_pos_arr(self):
-        self.checked_add(self.arr, self.arrpos)
+    goal_time = 0.2

-    def time_add_overflow_neg_arr(self):
-        self.checked_add(self.arr, self.arrneg)
+    def setup(self):
+        np.random.seed(1234)
+        N = 10**6
+        self.arr = np.arange(N)
+        self.arr_rev = np.arange(-N, 0)
+        self.arr_mixed = np.array([1, -1]).repeat(N / 2)
+        self.arr_nan_1 = np.random.choice([True, False], size=N)
+        self.arr_nan_2 = np.random.choice([True, False], size=N)

-    def time_add_overflow_mixed_arr(self):
-        self.checked_add(self.arr, self.arrmixed)
+    def time_add_overflow_arr_rev(self):
+        checked_add_with_arr(self.arr, self.arr_rev)

-    def time_add_overflow_first_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
+    def time_add_overflow_arr_mask_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)

-    def time_add_overflow_second_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
+    def time_add_overflow_b_mask_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed,
+                             b_mask=self.arr_nan_1)

    def time_add_overflow_both_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
-                         b_mask=self.arrmixed_nan)
+        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
+                             b_mask=self.arr_nan_2)

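Note on the AddOverflow* classes: `checked_add_with_arr` is pandas' overflow-checked int64 addition, and the `arr_mask`/`b_mask` arguments flag entries (e.g. NaT slots) to skip when checking; `AddOverflowScalar.params` makes asv call the timer once per value in `[1, -1, 0]`, passed in as `scalar`. A minimal sketch of the calling convention, assuming the private `pandas.core.algorithms` API imported above (variable names are illustrative):

    import numpy as np
    from pandas.core.algorithms import checked_add_with_arr

    arr = np.array([0, np.iinfo(np.int64).max])  # second slot would overflow
    mask = np.array([False, True])               # True = treat slot as missing

    # Masked overflow is tolerated; the sum comes back as a plain ndarray.
    checked_add_with_arr(arr, 1, arr_mask=mask)

    # Unmasked overflow raises instead of silently wrapping around.
    try:
        checked_add_with_arr(arr, 1)
    except OverflowError:
        pass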

class Hashing(object):
+
    goal_time = 0.2

-    def setup(self):
-        N = 100000
-
-        self.df = pd.DataFrame(
-            {'A': pd.Series(tm.makeStringIndex(100).take(
-                np.random.randint(0, 100, size=N))),
-             'B': pd.Series(tm.makeStringIndex(10000).take(
-                np.random.randint(0, 10000, size=N))),
-             'D': np.random.randn(N),
-             'E': np.arange(N),
-             'F': pd.date_range('20110101', freq='s', periods=N),
-             'G': pd.timedelta_range('1 day', freq='s', periods=N),
-             })
-        self.df['C'] = self.df['B'].astype('category')
-        self.df.iloc[10:20] = np.nan
-
-    def time_frame(self):
-        hashing.hash_pandas_object(self.df)
-
-    def time_series_int(self):
-        hashing.hash_pandas_object(self.df.E)
-
-    def time_series_string(self):
-        hashing.hash_pandas_object(self.df.B)
-
-    def time_series_categorical(self):
-        hashing.hash_pandas_object(self.df.C)
+    def setup_cache(self):
+        np.random.seed(1234)
+        N = 10**5
+
+        df = pd.DataFrame(
+            {'strings': pd.Series(tm.makeStringIndex(10000).take(
+                np.random.randint(0, 10000, size=N))),
+             'floats': np.random.randn(N),
+             'ints': np.arange(N),
+             'dates': pd.date_range('20110101', freq='s', periods=N),
+             'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
+        df['categories'] = df['strings'].astype('category')
+        df.iloc[10:20] = np.nan
+        return df
+
+    def time_frame(self, df):
+        hashing.hash_pandas_object(df)
+
+    def time_series_int(self, df):
+        hashing.hash_pandas_object(df['ints'])
+
+    def time_series_string(self, df):
+        hashing.hash_pandas_object(df['strings'])
+
+    def time_series_float(self, df):
+        hashing.hash_pandas_object(df['floats'])
+
+    def time_series_categorical(self, df):
+        hashing.hash_pandas_object(df['categories'])
+
+    def time_series_timedeltas(self, df):
+        hashing.hash_pandas_object(df['timedeltas'])
+
+    def time_series_dates(self, df):
+        hashing.hash_pandas_object(df['dates'])
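Why the Hashing timers now take a `df` argument: under asv's `setup_cache` contract, the method runs once per environment, its return value is cached, and that value is passed as the first argument to `setup` and to every `time_*` method. A rough sketch of that contract (a hypothetical driver, not asv's internals; assumes the classes above are importable):

    # What asv effectively does with Hashing:
    bench = Hashing()
    df = bench.setup_cache()       # built once; the DataFrame is reused
    bench.time_frame(df)           # every timer receives the cached frame
    bench.time_series_string(df)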