Skip to content

Commit b82a4e6

Browse files
committed
Merge branch 'groupby-speed-up' of https://github.com/behzadnouri/pandas into behzadnouri-groupby-speed-up
2 parents 3bb0803 + c5a3514 commit b82a4e6

File tree

3 files changed

+47
-11
lines changed

3 files changed

+47
-11
lines changed

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ Performance
488488
- Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`)
489489
- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`)
490490
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
491-
491+
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
492492

493493

494494

pandas/core/groupby.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -1559,7 +1559,7 @@ def _aggregate_series_fast(self, obj, func):
15591559

15601560
# avoids object / Series creation overhead
15611561
dummy = obj._get_values(slice(None, 0)).to_dense()
1562-
indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
1562+
indexer = _get_group_index_sorter(group_index, ngroups)
15631563
obj = obj.take(indexer, convert=False)
15641564
group_index = com.take_nd(group_index, indexer, allow_fill=False)
15651565
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
@@ -3271,7 +3271,7 @@ def slabels(self):
32713271
@cache_readonly
32723272
def sort_idx(self):
32733273
# Counting sort indexer
3274-
return _algos.groupsort_indexer(self.labels, self.ngroups)[0]
3274+
return _get_group_index_sorter(self.labels, self.ngroups)
32753275

32763276
def __iter__(self):
32773277
sdata = self._get_sorted_data()
@@ -3543,23 +3543,39 @@ def get_key(self, comp_id):
35433543

35443544

35453545
def _get_indices_dict(label_list, keys):
    """
    Build the mapping from group key to positional indices: sort the
    observations by their flattened group id, then hand the sorter and
    the sorted per-key labels off to ``lib.indices_fast``.
    """
    shape = [len(key) for key in keys]
    group_index = get_group_index(label_list, shape)

    # one sorting pass shared by every key column
    sorter = _get_group_index_sorter(group_index, np.prod(shape))

    sorted_labels = [labels.take(sorter) for labels in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
35583556

35593557

35603558
#----------------------------------------------------------------------
35613559
# sorting levels...cleverly?
35623560

3561+
def _get_group_index_sorter(group_index, ngroups):
3562+
"""
3563+
_algos.groupsort_indexer is at least O(ngroups), where
3564+
ngroups = prod(shape)
3565+
shape = map(len, keys)
3566+
that is, linear in the number of combinations (cartesian product) of unique
3567+
values of groupby keys. This can be huge when doing multi-key groupby.
3568+
np.argsort is O(count)^2 when using quicksort (the default) where count is the length
3569+
of the data-frame;
3570+
"""
3571+
count = len(group_index)
3572+
if ngroups < count * np.log(count): # taking complexities literally
3573+
sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
3574+
ngroups)
3575+
return com._ensure_platform_int(sorter)
3576+
else:
3577+
return group_index.argsort()
3578+
35633579

35643580
def _compress_group_index(group_index, sort=True):
35653581
"""

vb_suite/groupby.py

+20
Original file line numberDiff line numberDiff line change
@@ -454,3 +454,23 @@ def f(g):
454454
"""
455455

456456
groupby_transform_series2 = Benchmark("df.groupby('id')['val'].transform(np.mean)", setup)
457+
458+
# Multi-key groupby-transform benchmarks (:issue:`8128`).
# Seeded so every run sees identical random data.
setup = common_setup + '''
np.random.seed(2718281)
n = 20000
df = DataFrame(np.random.randint(1, n, (n, 3)),
               columns=['jim', 'joe', 'jolie'])
'''

# dropped the stray trailing semicolon (un-Pythonic)
stmt = "df.groupby(['jim', 'joe'])['jolie'].transform('max')"
groupby_transform_multi_key1 = Benchmark(stmt, setup)
# same transform with perfectly correlated keys (jim made equal to joe)
groupby_transform_multi_key2 = Benchmark(stmt, setup + "df['jim'] = df['joe']")

# larger frame with fewer distinct key values
setup = common_setup + '''
np.random.seed(2718281)
n = 200000
df = DataFrame(np.random.randint(1, n / 10, (n, 3)),
               columns=['jim', 'joe', 'jolie'])
'''
groupby_transform_multi_key3 = Benchmark(stmt, setup)
groupby_transform_multi_key4 = Benchmark(stmt, setup + "df['jim'] = df['joe']")

0 commit comments

Comments
 (0)