|
1 | 1 | from importlib import import_module
|
2 | 2 |
|
3 | 3 | import numpy as np
|
4 |
| - |
5 | 4 | import pandas as pd
|
6 | 5 | from pandas.util import testing as tm
|
7 | 6 |
|
|
12 | 11 | except:
|
13 | 12 | pass
|
14 | 13 |
|
15 |
class Factorize(object):
    # Benchmark ``Index.factorize`` across int, float and string indexes,
    # parametrized on whether the uniques are returned sorted.

    goal_time = 0.2

    param_names = ['sort']
    params = [True, False]

    def setup(self, sort):
        size = 10 ** 5
        np.random.seed(1234)
        # Repeat each value 5x so there are genuine duplicates to factorize.
        ints = np.arange(size).repeat(5)
        floats = np.random.randn(size).repeat(5)
        self.int_idx = pd.Int64Index(ints)
        self.float_idx = pd.Float64Index(floats)
        self.string_idx = tm.makeStringIndex(size)

    def time_factorize_int(self, sort):
        self.int_idx.factorize(sort=sort)

    def time_factorize_float(self, sort):
        self.float_idx.factorize(sort=sort)

    def time_factorize_string(self, sort):
        self.string_idx.factorize(sort=sort)
class Duplicated(object):
    # Benchmark ``Index.duplicated`` across int, float and string indexes,
    # parametrized on the ``keep`` strategy.

    goal_time = 0.2

    param_names = ['keep']
    params = ['first', 'last', False]

    def setup(self, keep):
        size = 10 ** 5
        np.random.seed(1234)
        # 5x repetition guarantees duplicated entries in each index.
        ints = np.arange(size).repeat(5)
        floats = np.random.randn(size).repeat(5)
        self.int_idx = pd.Int64Index(ints)
        self.float_idx = pd.Float64Index(floats)
        self.string_idx = tm.makeStringIndex(size)

    def time_duplicated_int(self, keep):
        self.int_idx.duplicated(keep=keep)

    def time_duplicated_float(self, keep):
        self.float_idx.duplicated(keep=keep)

    def time_duplicated_string(self, keep):
        self.string_idx.duplicated(keep=keep)
class DuplicatedUniqueIndex(object):
    # Benchmark ``Index.duplicated`` on an all-unique integer index, where
    # the cached ``is_unique`` fast path applies.

    goal_time = 0.2

    def setup(self):
        size = 10 ** 5
        self.idx_int_dup = pd.Int64Index(np.arange(size * 5))
        # Touch is_unique up front so its cached value is populated
        # before the timed call.
        self.idx_int_dup.is_unique

    def time_duplicated_unique_int(self):
        self.idx_int_dup.duplicated()
class Match(object):
    # Benchmark ``pd.match`` of a repeated string array against its uniques.

    goal_time = 0.2

    def setup(self):
        np.random.seed(1234)
        self.uniques = tm.makeStringIndex(1000).values
        # Each unique value appears 10 times in the lookup array.
        self.all = self.uniques.repeat(10)

    def time_match_string(self):
        pd.match(self.all, self.uniques)
|
class Hashing(object):
    # Benchmark ``hash_pandas_object`` over a mixed-dtype frame and each of
    # its columns. ``setup_cache`` builds the frame once and asv passes the
    # returned value to every timed method as ``df``.

    goal_time = 0.2

    def setup_cache(self):
        np.random.seed(1234)
        size = 10 ** 5

        labels = tm.makeStringIndex(10000)
        frame = pd.DataFrame(
            {'strings': pd.Series(labels.take(
                np.random.randint(0, 10000, size=size))),
             'floats': np.random.randn(size),
             'ints': np.arange(size),
             'dates': pd.date_range('20110101', freq='s', periods=size),
             'timedeltas': pd.timedelta_range('1 day', freq='s',
                                              periods=size)})
        frame['categories'] = frame['strings'].astype('category')
        # Inject a band of missing values so null handling is exercised.
        frame.iloc[10:20] = np.nan
        return frame

    def time_frame(self, df):
        hashing.hash_pandas_object(df)

    def time_series_int(self, df):
        hashing.hash_pandas_object(df['ints'])

    def time_series_string(self, df):
        hashing.hash_pandas_object(df['strings'])

    def time_series_float(self, df):
        hashing.hash_pandas_object(df['floats'])

    def time_series_categorical(self, df):
        hashing.hash_pandas_object(df['categories'])

    def time_series_timedeltas(self, df):
        hashing.hash_pandas_object(df['timedeltas'])

    def time_series_dates(self, df):
        hashing.hash_pandas_object(df['dates'])