@@ -1559,7 +1559,7 @@ def _aggregate_series_fast(self, obj, func):
1559
1559
1560
1560
# avoids object / Series creation overhead
1561
1561
dummy = obj ._get_values (slice (None , 0 )).to_dense ()
1562
- indexer = _algos . groupsort_indexer (group_index , ngroups )[ 0 ]
1562
+ indexer = _get_group_index_sorter (group_index , ngroups )
1563
1563
obj = obj .take (indexer , convert = False )
1564
1564
group_index = com .take_nd (group_index , indexer , allow_fill = False )
1565
1565
grouper = lib .SeriesGrouper (obj , func , group_index , ngroups ,
@@ -3271,7 +3271,7 @@ def slabels(self):
3271
3271
@cache_readonly
def sort_idx(self):
    """
    Return the indexer that sorts ``self.labels``.

    The indexer is whatever ``_get_group_index_sorter`` produces for
    ``self.labels`` and ``self.ngroups``; that helper decides which sorting
    strategy is used.
    """
    labels = self.labels
    ngroups = self.ngroups
    return _get_group_index_sorter(labels, ngroups)
3275
3275
3276
3276
def __iter__ (self ):
3277
3277
sdata = self ._get_sorted_data ()
@@ -3543,23 +3543,39 @@ def get_key(self, comp_id):
3543
3543
3544
3544
3545
3545
def _get_indices_dict(label_list, keys):
    """
    Sort the labels by their combined group index and hand the result to
    ``lib.indices_fast``.

    Parameters
    ----------
    label_list : sequence of objects supporting ``.take(indexer)``
        One labels array per grouping key.
    keys : sequence of sized objects
        One collection of unique key values per grouping key; their lengths
        define ``shape``.

    Returns
    -------
    Whatever ``lib.indices_fast`` returns for the sorted inputs.
    """
    shape = [len(level) for level in keys]
    # Upper bound on the number of groups: the cartesian product of the keys.
    ngroups = np.prod(shape)

    flat_index = get_group_index(label_list, shape)
    order = _get_group_index_sorter(flat_index, ngroups)

    # Re-order every labels array, and the flat index itself, with the sorter.
    labels_in_order = []
    for lab in label_list:
        labels_in_order.append(lab.take(order))
    flat_index_in_order = flat_index.take(order)

    return lib.indices_fast(order, flat_index_in_order, keys, labels_in_order)
3558
3556
3559
3557
3560
3558
#----------------------------------------------------------------------
3561
3559
# sorting levels...cleverly?
3562
3560
3561
def _get_group_index_sorter(group_index, ngroups):
    """
    Return an indexer that sorts ``group_index``, picking the cheaper of two
    sorting strategies.

    _algos.groupsort_indexer is at least O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of
    unique values of groupby keys. This can be huge when doing multi-key
    groupby.
    A comparison sort via np.argsort is O(count * log(count)), where count is
    the length of the data-frame (quicksort, numpy's default, is O(count^2)
    in the worst case).

    The counting sort is therefore only used when ngroups is smaller than
    count * log(count); otherwise argsort is used.

    Parameters
    ----------
    group_index : ndarray
        Flat group id of every row.
    ngroups : int
        Number of possible groups (upper bound on the ids in group_index).

    Returns
    -------
    ndarray
        Positions that put ``group_index`` in ascending order.
    """
    count = len(group_index)
    # ``count > 0`` keeps np.log away from 0 (RuntimeWarning and a nan
    # comparison) when there are no rows at all.
    if count > 0 and ngroups < count * np.log(count):  # taking complexities literally
        sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
                                             ngroups)
        return com._ensure_platform_int(sorter)
    # mergesort is stable: rows with equal group ids keep their original
    # relative order, so the result does not depend on numpy's quicksort
    # tie-breaking and stays consistent between the two branches
    # (assuming the counting-sort path preserves input order within a group).
    return group_index.argsort(kind='mergesort')
3578
+
3563
3579
3564
3580
def _compress_group_index (group_index , sort = True ):
3565
3581
"""
0 commit comments