Skip to content

Commit 43c1007

Browse files
committed
Merge pull request #9429 from behzadnouri/grby-mi
PERF: performance improvements in multi-key groupby
2 parents daed53b + 0a956bb commit 43c1007

File tree

3 files changed

+37
-20
lines changed

3 files changed

+37
-20
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -174,6 +174,7 @@ Performance
174174
- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
175175
- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)
176176
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
177+
- Performance improvements in multi-key ``groupby`` (:issue:`9429`)
177178

178179
Bug Fixes
179180
~~~~~~~~~

pandas/core/groupby.py

+17 -20
Original file line number | Diff line number | Diff line change
@@ -1217,11 +1217,9 @@ class BaseGrouper(object):
12171217
"""
12181218

12191219
def __init__(self, axis, groupings, sort=True, group_keys=True):
1220-
self.axis = axis
1221-
self.groupings = groupings
1222-
self.sort = sort
1223-
self.group_keys = group_keys
1224-
self.compressed = True
1220+
self._filter_empty_groups = self.compressed = len(groupings) != 1
1221+
self.axis, self.groupings, self.sort, self.group_keys = \
1222+
axis, groupings, sort, group_keys
12251223

12261224
@property
12271225
def shape(self):
@@ -1373,31 +1371,34 @@ def _get_compressed_labels(self):
13731371
return _compress_group_index(group_index)
13741372

13751373
ping = self.groupings[0]
1376-
self.compressed = False
1377-
self._filter_empty_groups = False
1378-
13791374
return ping.labels, np.arange(len(ping.group_index))
13801375

13811376
@cache_readonly
13821377
def ngroups(self):
13831378
return len(self.result_index)
13841379

1380+
@property
1381+
def recons_labels(self):
1382+
comp_ids, obs_ids, _ = self.group_info
1383+
labels = (ping.labels for ping in self.groupings)
1384+
return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels)
1385+
13851386
@cache_readonly
13861387
def result_index(self):
1387-
recons = self.get_group_levels()
1388-
return MultiIndex.from_arrays(recons, names=self.names)
1388+
if not self.compressed and len(self.groupings) == 1:
1389+
return self.groupings[0].group_index.rename(self.names[0])
13891390

1390-
def get_group_levels(self):
1391-
comp_ids, obs_ids, _ = self.group_info
1391+
return MultiIndex(levels=[ping.group_index for ping in self.groupings],
1392+
labels=self.recons_labels,
1393+
verify_integrity=False,
1394+
names=self.names)
13921395

1396+
def get_group_levels(self):
13931397
if not self.compressed and len(self.groupings) == 1:
13941398
return [self.groupings[0].group_index]
13951399

1396-
recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
1397-
self.shape, (ping.labels for ping in self.groupings))
1398-
13991400
name_list = []
1400-
for ping, labels in zip(self.groupings, recons_labels):
1401+
for ping, labels in zip(self.groupings, self.recons_labels):
14011402
labels = com._ensure_platform_int(labels)
14021403
levels = ping.group_index.take(labels)
14031404

@@ -1432,8 +1433,6 @@ def get_group_levels(self):
14321433

14331434
_name_functions = {}
14341435

1435-
_filter_empty_groups = True
1436-
14371436
def _get_aggregate_function(self, how, values):
14381437

14391438
dtype_str = values.dtype.name
@@ -1797,8 +1796,6 @@ def size(self):
17971796
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
17981797
}
17991798

1800-
_filter_empty_groups = True
1801-
18021799
def _aggregate(self, result, counts, values, how, is_numeric=True):
18031800

18041801
agg_func, dtype = self._get_aggregate_function(how, values)

vb_suite/groupby.py

+19
Original file line number | Diff line number | Diff line change
@@ -501,6 +501,25 @@ def f(g):
501501
groupby_int64_overflow = Benchmark("df.groupby(list('abcde')).max()", setup,
502502
name='groupby_int64_overflow')
503503

504+
505+
setup = common_setup + '''
506+
from itertools import product
507+
from string import ascii_letters, digits
508+
509+
n = 5 * 7 * 11 * (1 << 9)
510+
alpha = list(map(''.join, product(ascii_letters + digits, repeat=4)))
511+
f = lambda k: np.repeat(np.random.choice(alpha, n // k), k)
512+
513+
df = DataFrame({'a': f(11), 'b': f(7), 'c': f(5), 'd': f(1)})
514+
df['joe'] = (np.random.randn(len(df)) * 10).round(3)
515+
516+
i = np.random.permutation(len(df))
517+
df = df.iloc[i].reset_index(drop=True).copy()
518+
'''
519+
520+
groupby_multi_index = Benchmark("df.groupby(list('abcd')).max()", setup,
521+
name='groupby_multi_index')
522+
504523
#----------------------------------------------------------------------
505524
# groupby with a variable value for ngroups
506525

0 commit comments

Comments
 (0)