1
- from string import ascii_letters , digits
1
+ from string import ascii_letters
2
2
from itertools import product
3
3
from functools import partial
4
4
@@ -275,18 +275,12 @@ class GroupStrings(object):
275
275
276
276
def setup(self):
    # Build a ~200k-row DataFrame of 4-letter string keys arranged so the
    # groupby in time_multi_columns sees many small, equal-sized groups.
    n = 2 * 10 ** 5
    # Every 4-letter string over ascii_letters (52 ** 4 combinations).
    alpha = list(map(''.join, product(ascii_letters, repeat=4)))
    # Sample n // 5 rows of 4 keys each; replace=False keeps every sampled
    # cell distinct, so each sampled row is unique.  Repeating each row 5
    # times then yields groups of exactly 5 rows per (a, b, c, d) key.
    data = np.random.choice(alpha, (n // 5, 4), replace=False)
    data = np.repeat(data, 5, axis=0)
    self.df = DataFrame(data, columns=list('abcd'))
    # Numeric payload column to aggregate; rounded to 3 decimals.
    self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
    # Shuffle the rows so members of a group are not contiguous,
    # which is the more demanding layout for groupby.
    self.df = self.df.sample(frac=1).reset_index(drop=True)
290
284
291
285
def time_multi_columns(self):
    """Benchmark a max() aggregation grouped by all four string key columns."""
    key_columns = list('abcd')
    grouped = self.df.groupby(key_columns)
    grouped.max()
@@ -356,10 +350,16 @@ class GroupByMethods(object):
356
350
357
351
goal_time = 0.2

# asv parameter grid: run every listed groupby method for int and float
# values.  Each name must be an attribute of the SeriesGroupBy object,
# since setup() binds the chosen one with getattr.
# NOTE: 'diff' is included here — the earlier per-method benchmarks had a
# time_diff, and dropping it during the parametrization refactor would
# silently lose that benchmark.
param_names = ['dtype', 'method']
params = [['int', 'float'],
          ['all', 'any', 'count', 'cumcount', 'cummax', 'cummin',
           'cumprod', 'cumsum', 'describe', 'diff', 'first', 'head',
           'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
           'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
           'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
361
360
362
- def setup (self , dtype , ngroups ):
361
+ def setup (self , dtype , method ):
362
+ ngroups = 1000
363
363
size = ngroups * 2
364
364
rng = np .arange (ngroups )
365
365
values = rng .take (np .random .randint (0 , ngroups , size = size ))
@@ -369,104 +369,11 @@ def setup(self, dtype, ngroups):
369
369
key = np .concatenate ([np .random .random (ngroups ) * 0.1 ,
370
370
np .random .random (ngroups ) * 10.0 ])
371
371
372
- self . df = DataFrame ({'values' : values ,
373
- 'key' : key } )
372
+ df = DataFrame ({'values' : values , 'key' : key })
373
+ self . df_groupby_method = getattr ( df . groupby ( 'key' )[ 'values' ], method )
374
374
375
- def time_all (self , dtype , ngroups ):
376
- self .df .groupby ('key' )['values' ].all ()
377
-
378
- def time_any (self , dtype , ngroups ):
379
- self .df .groupby ('key' )['values' ].any ()
380
-
381
- def time_count (self , dtype , ngroups ):
382
- self .df .groupby ('key' )['values' ].count ()
383
-
384
- def time_cumcount (self , dtype , ngroups ):
385
- self .df .groupby ('key' )['values' ].cumcount ()
386
-
387
- def time_cummax (self , dtype , ngroups ):
388
- self .df .groupby ('key' )['values' ].cummax ()
389
-
390
- def time_cummin (self , dtype , ngroups ):
391
- self .df .groupby ('key' )['values' ].cummin ()
392
-
393
- def time_cumprod (self , dtype , ngroups ):
394
- self .df .groupby ('key' )['values' ].cumprod ()
395
-
396
- def time_cumsum (self , dtype , ngroups ):
397
- self .df .groupby ('key' )['values' ].cumsum ()
398
-
399
- def time_describe (self , dtype , ngroups ):
400
- self .df .groupby ('key' )['values' ].describe ()
401
-
402
- def time_diff (self , dtype , ngroups ):
403
- self .df .groupby ('key' )['values' ].diff ()
404
-
405
- def time_first (self , dtype , ngroups ):
406
- self .df .groupby ('key' )['values' ].first ()
407
-
408
- def time_head (self , dtype , ngroups ):
409
- self .df .groupby ('key' )['values' ].head ()
410
-
411
- def time_last (self , dtype , ngroups ):
412
- self .df .groupby ('key' )['values' ].last ()
413
-
414
- def time_mad (self , dtype , ngroups ):
415
- self .df .groupby ('key' )['values' ].mad ()
416
-
417
- def time_max (self , dtype , ngroups ):
418
- self .df .groupby ('key' )['values' ].max ()
419
-
420
- def time_mean (self , dtype , ngroups ):
421
- self .df .groupby ('key' )['values' ].mean ()
422
-
423
- def time_median (self , dtype , ngroups ):
424
- self .df .groupby ('key' )['values' ].median ()
425
-
426
- def time_min (self , dtype , ngroups ):
427
- self .df .groupby ('key' )['values' ].min ()
428
-
429
- def time_nunique (self , dtype , ngroups ):
430
- self .df .groupby ('key' )['values' ].nunique ()
431
-
432
- def time_pct_change (self , dtype , ngroups ):
433
- self .df .groupby ('key' )['values' ].pct_change ()
434
-
435
- def time_prod (self , dtype , ngroups ):
436
- self .df .groupby ('key' )['values' ].prod ()
437
-
438
- def time_rank (self , dtype , ngroups ):
439
- self .df .groupby ('key' )['values' ].rank ()
440
-
441
- def time_sem (self , dtype , ngroups ):
442
- self .df .groupby ('key' )['values' ].sem ()
443
-
444
- def time_shift (self , dtype , ngroups ):
445
- self .df .groupby ('key' )['values' ].shift ()
446
-
447
- def time_size (self , dtype , ngroups ):
448
- self .df .groupby ('key' )['values' ].size ()
449
-
450
- def time_skew (self , dtype , ngroups ):
451
- self .df .groupby ('key' )['values' ].skew ()
452
-
453
- def time_std (self , dtype , ngroups ):
454
- self .df .groupby ('key' )['values' ].std ()
455
-
456
- def time_sum (self , dtype , ngroups ):
457
- self .df .groupby ('key' )['values' ].sum ()
458
-
459
- def time_tail (self , dtype , ngroups ):
460
- self .df .groupby ('key' )['values' ].tail ()
461
-
462
- def time_unique (self , dtype , ngroups ):
463
- self .df .groupby ('key' )['values' ].unique ()
464
-
465
- def time_value_counts (self , dtype , ngroups ):
466
- self .df .groupby ('key' )['values' ].value_counts ()
467
-
468
- def time_var (self , dtype , ngroups ):
469
- self .df .groupby ('key' )['values' ].var ()
375
def time_method(self, dtype, method):
    """Invoke the groupby method that setup() pre-bound for this parameter
    combination; dtype and method only select the fixture."""
    bound = self.df_groupby_method
    bound()
470
377
471
378
472
379
class Float32 (object ):
0 commit comments