Skip to content

Commit c5a3514

Browse files
committed
PERF: speed up multi-key groupby
1 parent 86ecb99 commit c5a3514

File tree

3 files changed

+47
-11
lines changed

3 files changed

+47
-11
lines changed

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ Performance
486486
- Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`)
487487
- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`)
488488
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
489-
489+
- Performance improvements in multi-key ``groupby`` (:issue:`8128`)
490490

491491

492492

pandas/core/groupby.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -1550,7 +1550,7 @@ def _aggregate_series_fast(self, obj, func):
15501550

15511551
# avoids object / Series creation overhead
15521552
dummy = obj._get_values(slice(None, 0)).to_dense()
1553-
indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
1553+
indexer = _get_group_index_sorter(group_index, ngroups)
15541554
obj = obj.take(indexer, convert=False)
15551555
group_index = com.take_nd(group_index, indexer, allow_fill=False)
15561556
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
@@ -3262,7 +3262,7 @@ def slabels(self):
32623262
@cache_readonly
32633263
def sort_idx(self):
32643264
# Counting sort indexer
3265-
return _algos.groupsort_indexer(self.labels, self.ngroups)[0]
3265+
return _get_group_index_sorter(self.labels, self.ngroups)
32663266

32673267
def __iter__(self):
32683268
sdata = self._get_sorted_data()
@@ -3534,23 +3534,39 @@ def get_key(self, comp_id):
35343534

35353535

35363536
def _get_indices_dict(label_list, keys):
3537-
shape = [len(x) for x in keys]
3538-
group_index = get_group_index(label_list, shape)
3539-
3540-
sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
3541-
np.prod(shape))
3537+
shape = list(map(len, keys))
3538+
ngroups = np.prod(shape)
35423539

3543-
sorter_int = com._ensure_platform_int(sorter)
3540+
group_index = get_group_index(label_list, shape)
3541+
sorter = _get_group_index_sorter(group_index, ngroups)
35443542

3545-
sorted_labels = [lab.take(sorter_int) for lab in label_list]
3546-
group_index = group_index.take(sorter_int)
3543+
sorted_labels = [lab.take(sorter) for lab in label_list]
3544+
group_index = group_index.take(sorter)
35473545

35483546
return lib.indices_fast(sorter, group_index, keys, sorted_labels)
35493547

35503548

35513549
#----------------------------------------------------------------------
35523550
# sorting levels...cleverly?
35533551

3552+
def _get_group_index_sorter(group_index, ngroups):
3553+
'''
3554+
_algos.groupsort_indexer is at least O(ngroups), where
3555+
ngroups = prod(shape)
3556+
shape = map(len, keys)
3557+
that is, linear in the number of combinations (cartesian product) of unique
3558+
values of groupby keys. This can be huge when doing multi-key groupby.
3559+
np.argsort is (presumably) O(count x log(count)) where count is the length
3560+
of the data-frame.
3561+
'''
3562+
count = len(group_index)
3563+
if ngroups < count * np.log(count): # taking complexities literally
3564+
sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
3565+
ngroups)
3566+
return com._ensure_platform_int(sorter)
3567+
else:
3568+
return group_index.argsort()
3569+
35543570

35553571
def _compress_group_index(group_index, sort=True):
35563572
"""

vb_suite/groupby.py

+20
Original file line numberDiff line numberDiff line change
@@ -454,3 +454,23 @@ def f(g):
454454
"""
455455

456456
groupby_transform_series2 = Benchmark("df.groupby('id')['val'].transform(np.mean)", setup)
457+
458+
setup = common_setup + '''
459+
np.random.seed(2718281)
460+
n = 20000
461+
df = DataFrame(np.random.randint(1, n, (n, 3)),
462+
columns=['jim', 'joe', 'jolie'])
463+
'''
464+
465+
stmt = "df.groupby(['jim', 'joe'])['jolie'].transform('max')";
466+
groupby_transform_multi_key1 = Benchmark(stmt, setup)
467+
groupby_transform_multi_key2 = Benchmark(stmt, setup + "df['jim'] = df['joe']")
468+
469+
setup = common_setup + '''
470+
np.random.seed(2718281)
471+
n = 200000
472+
df = DataFrame(np.random.randint(1, n / 10, (n, 3)),
473+
columns=['jim', 'joe', 'jolie'])
474+
'''
475+
groupby_transform_multi_key3 = Benchmark(stmt, setup)
476+
groupby_transform_multi_key4 = Benchmark(stmt, setup + "df['jim'] = df['joe']")

0 commit comments

Comments
 (0)