14
14
# Groupby methods that raise for a given column dtype; parametrized
# benchmarks consult this mapping to skip invalid dtype/method combinations.
method_blacklist = {
    'object': {'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'mad',
               'max', 'mean', 'median', 'min', 'pct_change', 'prod', 'rank',
               'sem', 'skew', 'std', 'sum', 'var'},
    'datetime': {'cummax', 'cumprod', 'cumsum', 'describe', 'mad', 'mean',
                 'median', 'pct_change', 'prod', 'sem', 'skew', 'std', 'sum',
                 'var'},
}
@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
90
93
self .ser .groupby (self .ser ).groups
91
94
92
95
93
class FirstLast(object):
    """Benchmark ``first``/``last``/``nth`` group reductions across dtypes."""

    goal_time = 0.2

    param_names = ['dtype']
    params = ['float32', 'float64', 'datetime', 'object']

    def setup(self, dtype):
        N = 10 ** 5
        # with datetimes (GH7555)
        if dtype == 'datetime':
            frame_data = {'values': date_range('1/1/2011', periods=N,
                                               freq='s'),
                          'key': range(N)}
        elif dtype == 'object':
            frame_data = {'values': ['foo'] * N, 'key': range(N)}
        else:
            group_labels = np.arange(N / 10).repeat(10)
            vals = Series(np.random.randn(len(group_labels)), dtype=dtype)
            # Punch holes in the data so first/last have NaNs to skip over.
            vals[::3] = np.nan
            vals[1::3] = np.nan
            group_labels = group_labels.take(
                np.random.permutation(len(group_labels)))
            frame_data = {'values': vals, 'key': group_labels}
        self.df = DataFrame(frame_data)

    def time_groupby_first(self, dtype):
        self.df.groupby('key').first()

    def time_groupby_last(self, dtype):
        self.df.groupby('key').last()

    def time_groupby_nth_all(self, dtype):
        self.df.groupby('key').nth(0, dropna='all')

    def time_groupby_nth_none(self, dtype):
        self.df.groupby('key').nth(0)
96
class GroupManyLabels (object ):
133
97
134
98
goal_time = 0.2
class Nth(object):
    """Benchmarks for ``GroupBy.nth`` across several dtypes, at both frame
    and series level, with and without NaN dropping.
    """

    goal_time = 0.2

    param_names = ['dtype']
    params = ['float32', 'float64', 'datetime', 'object']

    def setup(self, dtype):
        N = 10 ** 5
        # with datetimes (GH7555)
        if dtype == 'datetime':
            values = date_range('1/1/2011', periods=N, freq='s')
        elif dtype == 'object':
            values = ['foo'] * N
        else:
            values = np.arange(N).astype(dtype)

        key = np.arange(N)
        self.df = DataFrame({'key': key, 'values': values})
        self.df.iloc[1, 1] = np.nan  # insert missing data

    def time_frame_nth_any(self, dtype):
        self.df.groupby('key').nth(0, dropna='any')

    def time_groupby_nth_all(self, dtype):
        self.df.groupby('key').nth(0, dropna='all')

    def time_frame_nth(self, dtype):
        self.df.groupby('key').nth(0)

    def time_series_nth_any(self, dtype):
        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

    # BUG FIX: this method was a second ``time_groupby_nth_all`` definition,
    # which silently shadowed the frame-level benchmark above so it never
    # ran; renamed to match the series-level naming of its neighbors.
    def time_series_nth_all(self, dtype):
        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

    def time_series_nth(self, dtype):
        self.df['values'].groupby(self.df['key']).nth(0)
186
151
187
152
class DateAttributes (object ):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
243
208
df .groupby (['key1' , 'key2' ]).count ()
244
209
245
210
246
- class CountInt (object ):
211
+ class CountMultiInt (object ):
247
212
248
213
goal_time = 0.2
249
214
@@ -255,18 +220,18 @@ def setup_cache(self):
255
220
'ints2' : np .random .randint (0 , 1000 , size = n )})
256
221
return df
257
222
258
- def time_int_count (self , df ):
223
+ def time_multi_int_count (self , df ):
259
224
df .groupby (['key1' , 'key2' ]).count ()
260
225
261
- def time_int_nunique (self , df ):
226
+ def time_multi_int_nunique (self , df ):
262
227
df .groupby (['key1' , 'key2' ]).nunique ()
263
228
264
229
265
230
class AggFunctions (object ):
266
231
267
232
goal_time = 0.2
268
233
269
- def setup_cache (self ):
234
+ def setup_cache ():
270
235
N = 10 ** 5
271
236
fac1 = np .array (['A' , 'B' , 'C' ], dtype = 'O' )
272
237
fac2 = np .array (['one' , 'two' ], dtype = 'O' )
@@ -361,9 +326,6 @@ def setup(self):
361
326
def time_multi_size (self ):
362
327
self .df .groupby (['key1' , 'key2' ]).size ()
363
328
364
- def time_dt_size (self ):
365
- self .df .groupby (['dates' ]).size ()
366
-
367
329
def time_dt_timegrouper_size (self ):
368
330
with warnings .catch_warnings (record = True ):
369
331
self .df .groupby (TimeGrouper (key = 'dates' , freq = 'M' )).size ()
class GroupByMethods(object):
    """Parametrized benchmarks for single-column groupby methods.

    ``application`` selects whether the method is called directly on the
    GroupBy object ('direct') or routed through ``transform``
    ('transformation'). Invalid dtype/method pairs are skipped via
    ``method_blacklist``.
    """

    goal_time = 0.2

    param_names = ['dtype', 'method', 'application']
    params = [['int', 'float', 'object', 'datetime'],
              ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
               'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
               'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
               'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
               'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
              ['direct', 'transformation']]

    def setup(self, dtype, method, application):
        if method in method_blacklist.get(dtype, {}):
            raise NotImplementedError  # skip benchmark
        ngroups = 1000
        # NOTE(review): the next three lines fell in a diff-hunk gap and were
        # reconstructed from surrounding context -- verify against original.
        size = ngroups * 2
        rng = np.arange(ngroups)
        values = rng.take(np.random.randint(0, ngroups, size=size))
        if dtype == 'int':
            key = np.random.randint(0, size, size=size)
        elif dtype == 'float':
            key = np.concatenate([np.random.random(ngroups) * 0.1,
                                  np.random.random(ngroups) * 10.0])
        elif dtype == 'object':
            key = ['foo'] * size
        elif dtype == 'datetime':
            key = date_range('1/1/2011', periods=size, freq='s')

        df = DataFrame({'values': values, 'key': key})

        # BUG FIX: the declared parameter value is 'transformation' (see
        # ``params``), but this branch compared against 'transform', so the
        # transform variants were never exercised and the 'transformation'
        # runs silently re-benchmarked the direct path.
        if application == 'transformation':
            if method == 'describe':
                # transform cannot apply describe; skip this combination
                raise NotImplementedError

            self.as_group_method = lambda: df.groupby(
                'key')['values'].transform(method)
            self.as_field_method = lambda: df.groupby(
                'values')['key'].transform(method)
        else:
            self.as_group_method = getattr(df.groupby('key')['values'], method)
            self.as_field_method = getattr(df.groupby('values')['key'], method)

    def time_dtype_as_group(self, dtype, method, application):
        self.as_group_method()

    def time_dtype_as_field(self, dtype, method, application):
        self.as_field_method()
407
386
408
387
409
388
class Float32 (object ):
0 commit comments