@@ -1550,7 +1550,7 @@ def _aggregate_series_fast(self, obj, func):
1550
1550
1551
1551
# avoids object / Series creation overhead
1552
1552
dummy = obj ._get_values (slice (None , 0 )).to_dense ()
1553
- indexer = _algos . groupsort_indexer (group_index , ngroups )[ 0 ]
1553
+ indexer = _get_group_index_sorter (group_index , ngroups )
1554
1554
obj = obj .take (indexer , convert = False )
1555
1555
group_index = com .take_nd (group_index , indexer , allow_fill = False )
1556
1556
grouper = lib .SeriesGrouper (obj , func , group_index , ngroups ,
@@ -3262,7 +3262,7 @@ def slabels(self):
3262
3262
@cache_readonly
def sort_idx(self):
    """
    Indexer computed by ``_get_group_index_sorter`` from ``self.labels``
    and ``self.ngroups``.
    """
    # The helper picks between the counting-sort indexer and argsort.
    labels, ngroups = self.labels, self.ngroups
    return _get_group_index_sorter(labels, ngroups)
3266
3266
3267
3267
def __iter__ (self ):
3268
3268
sdata = self ._get_sorted_data ()
@@ -3534,23 +3534,39 @@ def get_key(self, comp_id):
3534
3534
3535
3535
3536
3536
def _get_indices_dict(label_list, keys):
    """
    Build the result of ``lib.indices_fast`` for the given labels and keys.

    The labels are combined into a single group index, everything is
    reordered by the sorter for that index, and the sorted pieces are handed
    to ``lib.indices_fast``.

    Parameters
    ----------
    label_list : sequence of arrays
        One label array per key; each must support ``take``.
    keys : sequence of sized objects
        One entry per key; its length is that key's extent in ``shape``.
    """
    # One dimension per key; the product is the size of the cartesian space.
    shape = [len(key) for key in keys]
    n_combinations = np.prod(shape)

    flat_index = get_group_index(label_list, shape)
    order = _get_group_index_sorter(flat_index, n_combinations)

    # Reorder both the per-key labels and the combined index the same way.
    labels_in_order = [labels.take(order) for labels in label_list]
    flat_index = flat_index.take(order)

    return lib.indices_fast(order, flat_index, keys, labels_in_order)
3549
3547
3550
3548
3551
3549
#----------------------------------------------------------------------
3552
3550
# sorting levels...cleverly?
3553
3551
3552
def _get_group_index_sorter(group_index, ngroups):
    """
    Return an indexer that sorts ``group_index``, using whichever of two
    algorithms is expected to be cheaper.

    _algos.groupsort_indexer is at least O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of
    unique values of groupby keys. This can be huge when doing multi-key
    groupby.
    np.argsort is (presumably) O(count x log(count)) where count is the
    length of the data-frame.

    Parameters
    ----------
    group_index : ndarray
        Group id for each row.
    ngroups : int
        Size of the group-id space (see above).

    Returns
    -------
    ndarray
        Integer positions that reorder ``group_index`` by group. Rows that
        share a group id keep their original relative order on the argsort
        path.
    """
    count = len(group_index)
    # ``count > 0`` avoids np.log(0), which emits RuntimeWarnings and yields
    # a NaN threshold; empty input takes the argsort path as it did before.
    if count > 0 and ngroups < count * np.log(count):
        # taking complexities literally
        sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
                                             ngroups)
        return com._ensure_platform_int(sorter)

    # A stable sort is requested so that ties are not reordered; the default
    # argsort kind gives no such guarantee, which would make within-group
    # order depend on which branch was taken.
    return group_index.argsort(kind='mergesort')
3569
+
3554
3570
3555
3571
def _compress_group_index (group_index , sort = True ):
3556
3572
"""
0 commit comments