Skip to content

PERF: performance improvements in multi-key groupby #9429

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 7, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ Performance
- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
- Performance improvements in multi-key ``groupby`` (:issue:`9429`)

Bug Fixes
~~~~~~~~~
Expand Down
37 changes: 17 additions & 20 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1217,11 +1217,9 @@ class BaseGrouper(object):
"""

def __init__(self, axis, groupings, sort=True, group_keys=True):
self.axis = axis
self.groupings = groupings
self.sort = sort
self.group_keys = group_keys
self.compressed = True
self._filter_empty_groups = self.compressed = len(groupings) != 1
self.axis, self.groupings, self.sort, self.group_keys = \
axis, groupings, sort, group_keys

@property
def shape(self):
Expand Down Expand Up @@ -1373,31 +1371,34 @@ def _get_compressed_labels(self):
return _compress_group_index(group_index)

ping = self.groupings[0]
self.compressed = False
self._filter_empty_groups = False

return ping.labels, np.arange(len(ping.group_index))

@cache_readonly
def ngroups(self):
return len(self.result_index)

@property
def recons_labels(self):
comp_ids, obs_ids, _ = self.group_info
labels = (ping.labels for ping in self.groupings)
return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels)

@cache_readonly
def result_index(self):
recons = self.get_group_levels()
return MultiIndex.from_arrays(recons, names=self.names)
if not self.compressed and len(self.groupings) == 1:
return self.groupings[0].group_index.rename(self.names[0])

def get_group_levels(self):
comp_ids, obs_ids, _ = self.group_info
return MultiIndex(levels=[ping.group_index for ping in self.groupings],
labels=self.recons_labels,
verify_integrity=False,
names=self.names)

def get_group_levels(self):
if not self.compressed and len(self.groupings) == 1:
return [self.groupings[0].group_index]

recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
self.shape, (ping.labels for ping in self.groupings))

name_list = []
for ping, labels in zip(self.groupings, recons_labels):
for ping, labels in zip(self.groupings, self.recons_labels):
labels = com._ensure_platform_int(labels)
levels = ping.group_index.take(labels)

Expand Down Expand Up @@ -1432,8 +1433,6 @@ def get_group_levels(self):

_name_functions = {}

_filter_empty_groups = True

def _get_aggregate_function(self, how, values):

dtype_str = values.dtype.name
Expand Down Expand Up @@ -1797,8 +1796,6 @@ def size(self):
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
}

_filter_empty_groups = True

def _aggregate(self, result, counts, values, how, is_numeric=True):

agg_func, dtype = self._get_aggregate_function(how, values)
Expand Down
19 changes: 19 additions & 0 deletions vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,25 @@ def f(g):
groupby_int64_overflow = Benchmark("df.groupby(list('abcde')).max()", setup,
name='groupby_int64_overflow')


# vbench benchmark for multi-key groupby performance (pandas GH #9429).
# The triple-quoted payload below is SOURCE CODE executed by vbench as the
# benchmark setup, not run here: it builds a DataFrame of n = 5*7*11*512
# (= 197,120) rows with four random 4-character string key columns
# ('a'..'d') of differing repeat factors (11, 7, 5, 1 — i.e. differing
# cardinalities), adds a float value column 'joe', then shuffles the rows
# so group members are not contiguous in memory.
setup = common_setup + '''
from itertools import product
from string import ascii_letters, digits

n = 5 * 7 * 11 * (1 << 9)
alpha = list(map(''.join, product(ascii_letters + digits, repeat=4)))
f = lambda k: np.repeat(np.random.choice(alpha, n // k), k)

df = DataFrame({'a': f(11), 'b': f(7), 'c': f(5), 'd': f(1)})
df['joe'] = (np.random.randn(len(df)) * 10).round(3)

i = np.random.permutation(len(df))
df = df.iloc[i].reset_index(drop=True).copy()
'''

# Times a max() aggregation grouped on all four string key columns;
# registered under the name 'groupby_multi_index' in the vb_suite runner.
groupby_multi_index = Benchmark("df.groupby(list('abcd')).max()", setup,
                                name='groupby_multi_index')

#----------------------------------------------------------------------
# groupby with a variable value for ngroups

Expand Down