Skip to content

Commit da6f827

Browse files
WillAyd authored and jreback committed
Refactored GroupBy ASVs (pandas-dev#20043)
1 parent bd31f71 commit da6f827

File tree

1 file changed

+58
-79
lines changed

1 file changed

+58
-79
lines changed

asv_bench/benchmarks/groupby.py

+58-79
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
method_blacklist = {
1515
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
1616
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
17-
'var', 'mad', 'describe', 'std'}
17+
'var', 'mad', 'describe', 'std'},
18+
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
19+
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
20+
'std'}
1821
}
1922

2023

@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
9093
self.ser.groupby(self.ser).groups
9194

9295

93-
class FirstLast(object):
94-
95-
goal_time = 0.2
96-
97-
param_names = ['dtype']
98-
params = ['float32', 'float64', 'datetime', 'object']
99-
100-
def setup(self, dtype):
101-
N = 10**5
102-
# with datetimes (GH7555)
103-
if dtype == 'datetime':
104-
self.df = DataFrame({'values': date_range('1/1/2011',
105-
periods=N,
106-
freq='s'),
107-
'key': range(N)})
108-
elif dtype == 'object':
109-
self.df = DataFrame({'values': ['foo'] * N,
110-
'key': range(N)})
111-
else:
112-
labels = np.arange(N / 10).repeat(10)
113-
data = Series(np.random.randn(len(labels)), dtype=dtype)
114-
data[::3] = np.nan
115-
data[1::3] = np.nan
116-
labels = labels.take(np.random.permutation(len(labels)))
117-
self.df = DataFrame({'values': data, 'key': labels})
118-
119-
def time_groupby_first(self, dtype):
120-
self.df.groupby('key').first()
121-
122-
def time_groupby_last(self, dtype):
123-
self.df.groupby('key').last()
124-
125-
def time_groupby_nth_all(self, dtype):
126-
self.df.groupby('key').nth(0, dropna='all')
127-
128-
def time_groupby_nth_none(self, dtype):
129-
self.df.groupby('key').nth(0)
130-
131-
13296
class GroupManyLabels(object):
13397

13498
goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):
149113

150114
goal_time = 0.2
151115

152-
def setup_cache(self):
153-
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
154-
df.iloc[1, 1] = np.nan
155-
return df
156-
157-
def time_frame_nth_any(self, df):
158-
df.groupby(0).nth(0, dropna='any')
159-
160-
def time_frame_nth(self, df):
161-
df.groupby(0).nth(0)
162-
116+
param_names = ['dtype']
117+
params = ['float32', 'float64', 'datetime', 'object']
163118

164-
def time_series_nth_any(self, df):
165-
df[1].groupby(df[0]).nth(0, dropna='any')
119+
def setup(self, dtype):
120+
N = 10**5
121+
# with datetimes (GH7555)
122+
if dtype == 'datetime':
123+
values = date_range('1/1/2011', periods=N, freq='s')
124+
elif dtype == 'object':
125+
values = ['foo'] * N
126+
else:
127+
values = np.arange(N).astype(dtype)
166128

167-
def time_series_nth(self, df):
168-
df[1].groupby(df[0]).nth(0)
129+
key = np.arange(N)
130+
self.df = DataFrame({'key': key, 'values': values})
131+
self.df.iloc[1, 1] = np.nan # insert missing data
169132

133+
def time_frame_nth_any(self, dtype):
134+
self.df.groupby('key').nth(0, dropna='any')
170135

171-
class NthObject(object):
136+
def time_groupby_nth_all(self, dtype):
137+
self.df.groupby('key').nth(0, dropna='all')
172138

173-
goal_time = 0.2
139+
def time_frame_nth(self, dtype):
140+
self.df.groupby('key').nth(0)
174141

175-
def setup_cache(self):
176-
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
177-
df['obj'] = ['a'] * 5000 + ['b'] * 5000
178-
return df
142+
def time_series_nth_any(self, dtype):
143+
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
179144

180-
def time_nth(self, df):
181-
df.groupby('g').nth(5)
145+
def time_series_nth_all(self, dtype):
    """Benchmark ``SeriesGroupBy.nth(0, dropna='all')``.

    Selects the ``'values'`` column of ``self.df`` as a Series, groups it
    by the ``'key'`` column and takes the first row of every group with
    ``dropna='all'``.

    ``dtype`` is the benchmark parameter; it is not read here.

    This method previously reused the name ``time_groupby_nth_all``, which
    is already taken by the DataFrame-based benchmark defined earlier in
    the same class.  A repeated ``def`` in a class body rebinds the name,
    so the DataFrame benchmark was silently dropped.  The ``time_series_``
    prefix matches the sibling ``time_series_nth_any`` and
    ``time_series_nth`` benchmarks.
    """
    self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
182147

183-
def time_nth_last(self, df):
184-
df.groupby('g').last()
148+
def time_series_nth(self, dtype):
149+
self.df['values'].groupby(self.df['key']).nth(0)
185150

186151

187152
class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
243208
df.groupby(['key1', 'key2']).count()
244209

245210

246-
class CountInt(object):
211+
class CountMultiInt(object):
247212

248213
goal_time = 0.2
249214

@@ -255,18 +220,18 @@ def setup_cache(self):
255220
'ints2': np.random.randint(0, 1000, size=n)})
256221
return df
257222

258-
def time_int_count(self, df):
223+
def time_multi_int_count(self, df):
259224
df.groupby(['key1', 'key2']).count()
260225

261-
def time_int_nunique(self, df):
226+
def time_multi_int_nunique(self, df):
262227
df.groupby(['key1', 'key2']).nunique()
263228

264229

265230
class AggFunctions(object):
266231

267232
goal_time = 0.2
268233

269-
def setup_cache(self):
234+
def setup_cache():
270235
N = 10**5
271236
fac1 = np.array(['A', 'B', 'C'], dtype='O')
272237
fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
361326
def time_multi_size(self):
362327
self.df.groupby(['key1', 'key2']).size()
363328

364-
def time_dt_size(self):
365-
self.df.groupby(['dates']).size()
366-
367329
def time_dt_timegrouper_size(self):
368330
with warnings.catch_warnings(record=True):
369331
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
376338

377339
goal_time = 0.2
378340

379-
param_names = ['dtype', 'method']
380-
params = [['int', 'float', 'object'],
341+
param_names = ['dtype', 'method', 'application']
342+
params = [['int', 'float', 'object', 'datetime'],
381343
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
382344
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
383345
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
384346
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
385-
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
347+
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
348+
['direct', 'transformation']]
386349

387-
def setup(self, dtype, method):
350+
def setup(self, dtype, method, application):
388351
if method in method_blacklist.get(dtype, {}):
389352
raise NotImplementedError # skip benchmark
390353
ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
398361
np.random.random(ngroups) * 10.0])
399362
elif dtype == 'object':
400363
key = ['foo'] * size
364+
elif dtype == 'datetime':
365+
key = date_range('1/1/2011', periods=size, freq='s')
401366

402367
df = DataFrame({'values': values, 'key': key})
403-
self.df_groupby_method = getattr(df.groupby('key')['values'], method)
404368

405-
def time_method(self, dtype, method):
406-
self.df_groupby_method()
369+
if application == 'transform':
370+
if method == 'describe':
371+
raise NotImplementedError
372+
373+
self.as_group_method = lambda: df.groupby(
374+
'key')['values'].transform(method)
375+
self.as_field_method = lambda: df.groupby(
376+
'values')['key'].transform(method)
377+
else:
378+
self.as_group_method = getattr(df.groupby('key')['values'], method)
379+
self.as_field_method = getattr(df.groupby('values')['key'], method)
380+
381+
def time_dtype_as_group(self, dtype, method, application):
382+
self.as_group_method()
383+
384+
def time_dtype_as_field(self, dtype, method, application):
385+
self.as_field_method()
407386

408387

409388
class Float32(object):

0 commit comments

Comments
 (0)