Skip to content

Commit b82a4e6

Browse files
committed
Merge branch 'groupby-speed-up' of https://github.com/behzadnouri/pandas into behzadnouri-groupby-speed-up
2 parents 3bb0803 + c5a3514 commit b82a4e6

File tree

3 files changed

+47
-11
lines changed

3 files changed

+47
-11
lines changed

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ Performance
488488
- Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`)
489489
- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`)
490490
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
491-
491+
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
492492

493493

494494

pandas/core/groupby.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -1559,7 +1559,7 @@ def _aggregate_series_fast(self, obj, func):
15591559

15601560
# avoids object / Series creation overhead
15611561
dummy = obj._get_values(slice(None, 0)).to_dense()
1562-
indexer = _algos.groupsort_indexer(group_index, ngroups)[0]
1562+
indexer = _get_group_index_sorter(group_index, ngroups)
15631563
obj = obj.take(indexer, convert=False)
15641564
group_index = com.take_nd(group_index, indexer, allow_fill=False)
15651565
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
@@ -3271,7 +3271,7 @@ def slabels(self):
32713271
@cache_readonly
32723272
def sort_idx(self):
32733273
# Counting sort indexer
3274-
return _algos.groupsort_indexer(self.labels, self.ngroups)[0]
3274+
return _get_group_index_sorter(self.labels, self.ngroups)
32753275

32763276
def __iter__(self):
32773277
sdata = self._get_sorted_data()
@@ -3543,23 +3543,39 @@ def get_key(self, comp_id):
35433543

35443544

35453545
def _get_indices_dict(label_list, keys):
    """
    Build the mapping from group key to positional indices: sort the
    observations by their flattened group id, then hand the sorter and
    the sorted per-key labels off to ``lib.indices_fast``.
    """
    shape = [len(key) for key in keys]
    group_index = get_group_index(label_list, shape)

    # one sorting pass shared by every key column
    sorter = _get_group_index_sorter(group_index, np.prod(shape))

    sorted_labels = [labels.take(sorter) for labels in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
35583556

35593557

35603558
#----------------------------------------------------------------------
35613559
# sorting levels...cleverly?
35623560

3561+
def _get_group_index_sorter(group_index, ngroups):
3562+
"""
3563+
_algos.groupsort_indexer is at least O(ngroups), where
3564+
ngroups = prod(shape)
3565+
shape = map(len, keys)
3566+
that is, linear in the number of combinations (cartesian product) of unique
3567+
values of groupby keys. This can be huge when doing multi-key groupby.
3568+
np.argsort is O(count)^2 when using quicksort (the default) where count is the length
3569+
of the data-frame;
3570+
"""
3571+
count = len(group_index)
3572+
if ngroups < count * np.log(count): # taking complexities literally
3573+
sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
3574+
ngroups)
3575+
return com._ensure_platform_int(sorter)
3576+
else:
3577+
return group_index.argsort()
3578+
35633579

35643580
def _compress_group_index(group_index, sort=True):
35653581
"""

vb_suite/groupby.py

+20
Original file line numberDiff line numberDiff line change
@@ -454,3 +454,23 @@ def f(g):
454454
"""
455455

456456
groupby_transform_series2 = Benchmark("df.groupby('id')['val'].transform(np.mean)", setup)
457+
458+
# Multi-key groupby-transform benchmarks (:issue:`8128`).
# Seeded so every run sees identical random data.
setup = common_setup + '''
np.random.seed(2718281)
n = 20000
df = DataFrame(np.random.randint(1, n, (n, 3)),
               columns=['jim', 'joe', 'jolie'])
'''

# dropped the stray trailing semicolon (un-Pythonic)
stmt = "df.groupby(['jim', 'joe'])['jolie'].transform('max')"
groupby_transform_multi_key1 = Benchmark(stmt, setup)
# same transform with perfectly correlated keys (jim made equal to joe)
groupby_transform_multi_key2 = Benchmark(stmt, setup + "df['jim'] = df['joe']")

# larger frame with fewer distinct key values
setup = common_setup + '''
np.random.seed(2718281)
n = 200000
df = DataFrame(np.random.randint(1, n / 10, (n, 3)),
               columns=['jim', 'joe', 'jolie'])
'''
groupby_transform_multi_key3 = Benchmark(stmt, setup)
groupby_transform_multi_key4 = Benchmark(stmt, setup + "df['jim'] = df['joe']")

0 commit comments

Comments
 (0)