1
1
from itertools import izip
2
2
import types
3
-
4
3
import numpy as np
5
4
6
5
from pandas .core .frame import DataFrame
@@ -413,9 +412,10 @@ class Grouper(object):
413
412
"""
414
413
415
414
"""
416
- def __init__ (self , axis , groupings ):
415
+ def __init__ (self , axis , groupings , sort = True ):
417
416
self .axis = axis
418
417
self .groupings = groupings
418
+ self .sort = sort
419
419
420
420
@property
421
421
def shape (self ):
@@ -507,19 +507,56 @@ def groups(self):
507
507
508
508
@cache_readonly
def group_info(self):
    """
    Return ``(comp_ids, obs_group_ids, ngroups)`` for this grouper.

    The compressed labels and observed ids come from
    ``self._get_compressed_labels()``; ``ngroups`` is the number of
    observed ids and the labels are passed through
    ``com._ensure_int32`` before being returned.
    """
    labels, observed = self._get_compressed_labels()
    n_observed = len(observed)
    return com._ensure_int32(labels), observed, n_observed
522
515
516
def _get_compressed_labels(self):
    """
    Return a ``(labels, observed_ids)`` pair for the current groupings.

    When ``self._overflow_possible`` is false the per-grouping labels
    are combined with ``get_group_index`` (only if there is more than
    one grouping) and compressed with ``_compress_group_index``.
    Otherwise the labels are zipped into tuples and factorized, and the
    result is reordered by ``_reorder_by_uniques`` when ``self.sort``
    is set.
    """
    per_key_labels = [grouping.labels for grouping in self.groupings]

    if not self._overflow_possible:
        if len(per_key_labels) > 1:
            flat_index = get_group_index(per_key_labels, self.shape)
        else:
            flat_index = per_key_labels[0]
        compressed, observed = _compress_group_index(flat_index)
        return compressed, observed

    # a flat int64 index cannot be used here, so factorize the zipped
    # label tuples directly
    zipped = lib.fast_zip(per_key_labels)
    labs, uniques, _ = algos.factorize(zipped)
    if self.sort:
        uniques, labs = _reorder_by_uniques(uniques, labs)
    return labs, uniques
533
+
534
@cache_readonly
def _overflow_possible(self):
    """Result of ``_int64_overflow_possible`` applied to ``self.shape``."""
    shape = self.shape
    return _int64_overflow_possible(shape)
537
+
538
@cache_readonly
def result_index(self):
    """
    MultiIndex built from the arrays returned by ``get_group_levels``,
    named with ``self.names``.
    """
    return MultiIndex.from_arrays(self.get_group_levels(),
                                  names=self.names)
542
+
543
def get_group_levels(self):
    """
    Return one array per grouping, taken from that grouping's
    ``group_index`` at the label positions of the observed groups.

    The observed ids are ``self.group_info[1]``.  When
    ``self._overflow_possible`` is set they are unzipped column-wise
    into per-grouping label arrays; otherwise they are passed to
    ``decons_group_index`` together with ``self.shape``.
    """
    observed = self.group_info[1]

    if self._overflow_possible:
        per_level = [np.array(column) for column in izip(*observed)]
    else:
        per_level = decons_group_index(observed, self.shape)

    return [grouping.group_index.take(com._ensure_platform_int(codes))
            for grouping, codes in zip(self.groupings, per_level)]
556
+
557
+ #------------------------------------------------------------
558
+ # Aggregation functions
559
+
523
560
_cython_functions = {
524
561
'add' : lib .group_add ,
525
562
'mean' : lib .group_mean ,
@@ -603,22 +640,6 @@ def _aggregate_series_pure_python(self, obj, func):
603
640
result = lib .maybe_convert_objects (result , try_float = 0 )
604
641
return result , counts
605
642
606
- @cache_readonly
607
- def result_index (self ):
608
- recons = self .get_group_levels ()
609
- return MultiIndex .from_arrays (recons , names = self .names )
610
-
611
- def get_group_levels (self ):
612
- obs_ids = self .group_info [1 ]
613
- recons_labels = decons_group_index (obs_ids , self .shape )
614
-
615
- name_list = []
616
- for ping , labels in zip (self .groupings , recons_labels ):
617
- labels = com ._ensure_platform_int (labels )
618
- name_list .append (ping .group_index .take (labels ))
619
-
620
- return name_list
621
-
622
643
623
644
class Grouping (object ):
624
645
"""
@@ -793,7 +814,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
793
814
ping .name = 'key_%d' % i
794
815
groupings .append (ping )
795
816
796
- grouper = Grouper (group_axis , groupings )
817
+ grouper = Grouper (group_axis , groupings , sort = sort )
797
818
798
819
return grouper , exclusions
799
820
@@ -1470,24 +1491,21 @@ def get_group_index(label_list, shape):
1470
1491
n = len (label_list [0 ])
1471
1492
group_index = np .zeros (n , dtype = np .int64 )
1472
1493
mask = np .zeros (n , dtype = bool )
1473
-
1474
- if _int64_overflow_possible (shape ):
1475
- raise Exception ('Possible int64 overflow, raise exception for now' )
1476
- else :
1477
- for i in xrange (len (shape )):
1478
- stride = np .prod ([x for x in shape [i + 1 :]], dtype = np .int64 )
1479
- group_index += com ._ensure_int64 (label_list [i ]) * stride
1480
- mask |= label_list [i ] < 0
1494
+ for i in xrange (len (shape )):
1495
+ stride = np .prod ([x for x in shape [i + 1 :]], dtype = np .int64 )
1496
+ group_index += com ._ensure_int64 (label_list [i ]) * stride
1497
+ mask |= label_list [i ] < 0
1481
1498
1482
1499
np .putmask (group_index , mask , - 1 )
1483
1500
return group_index
1484
1501
1502
+ _INT64_MAX = np .iinfo (np .int64 ).max
1485
1503
def _int64_overflow_possible (shape ):
1486
1504
the_prod = 1L
1487
1505
for x in shape :
1488
1506
the_prod *= long (x )
1489
1507
1490
- return the_prod >= 2 ** 63
1508
+ return the_prod >= _INT64_MAX
1491
1509
1492
1510
def decons_group_index (comp_labels , shape ):
1493
1511
# reconstruct labels
@@ -1504,6 +1522,39 @@ def decons_group_index(comp_labels, shape):
1504
1522
return label_list [::- 1 ]
1505
1523
1506
1524
1525
def _indexer_from_factorized(labels, shape, compress=True):
    """
    Build an indexer from a list of label arrays and their level sizes.

    If ``_int64_overflow_possible(shape)`` is true the indexer is
    ``np.lexsort`` over the label arrays in reversed order.  Otherwise
    the labels are combined with ``get_group_index`` and handed to
    ``lib.groupsort_indexer`` (cast to ``'i4'``), either after
    compression (``compress=True``) or as-is with ``np.prod(shape)`` as
    the group count.
    """
    if _int64_overflow_possible(shape):
        # np.lexsort treats the last key as the primary one, hence the
        # reversal of the label list
        return np.lexsort(np.array(labels[::-1]))

    flat_index = get_group_index(labels, shape)

    if not compress:
        ids, n_groups = flat_index, np.prod(shape)
    else:
        ids, observed = _compress_group_index(flat_index)
        n_groups = len(observed)

    indexer, _ = lib.groupsort_indexer(ids.astype('i4'), n_groups)
    return indexer
1542
+
1543
+
1544
def _lexsort_indexer(keys):
    """
    Factorize each key array (as object dtype, with ``sort=True``) and
    pass the resulting label arrays and unique counts to
    ``_indexer_from_factorized``.
    """
    labels, shape = [], []

    for key in keys:
        factorizer = lib.Factorizer(len(key))

        # the factorizer is fed object arrays only
        if key.dtype != np.object_:
            key = key.astype('O')

        codes, _ = factorizer.factorize(key, sort=True)
        labels.append(codes)
        shape.append(len(factorizer.uniques))

    return _indexer_from_factorized(labels, shape)
1557
+
1507
1558
class _KeyMapper (object ):
1508
1559
"""
1509
1560
Ease my suffering. Map compressed group id -> key tuple
@@ -1548,23 +1599,29 @@ def _compress_group_index(group_index, sort=True):
1548
1599
obs_group_ids = np .array (uniques , dtype = 'i8' )
1549
1600
1550
1601
if sort and len (obs_group_ids ) > 0 :
1551
- # sorter is index where elements ought to go
1552
- sorter = obs_group_ids . argsort ( )
1602
+ obs_group_ids , comp_ids = _reorder_by_uniques ( obs_group_ids ,
1603
+ comp_ids )
1553
1604
1554
- # reverse_indexer is where elements came from
1555
- reverse_indexer = np .empty (len (sorter ), dtype = 'i4' )
1556
- reverse_indexer .put (sorter , np .arange (len (sorter )))
1605
+ return comp_ids , obs_group_ids
1557
1606
1558
- mask = comp_ids < 0
1607
+ def _reorder_by_uniques (uniques , labels ):
1608
+ # sorter is index where elements ought to go
1609
+ sorter = uniques .argsort ()
1559
1610
1560
- # move comp_ids to right locations (ie, unsort ascending labels)
1561
- comp_ids = reverse_indexer . take ( comp_ids )
1562
- np . putmask ( comp_ids , mask , - 1 )
1611
+ # reverse_indexer is where elements came from
1612
+ reverse_indexer = np . empty ( len ( sorter ), dtype = 'i4' )
1613
+ reverse_indexer . put ( sorter , np . arange ( len ( sorter )) )
1563
1614
1564
- # sort observed ids
1565
- obs_group_ids = obs_group_ids .take (sorter )
1615
+ mask = labels < 0
1566
1616
1567
- return comp_ids , obs_group_ids
1617
+ # move labels to right locations (ie, unsort ascending labels)
1618
+ labels = reverse_indexer .take (labels )
1619
+ np .putmask (labels , mask , - 1 )
1620
+
1621
+ # sort observed ids
1622
+ uniques = uniques .take (sorter )
1623
+
1624
+ return uniques , labels
1568
1625
1569
1626
def _groupby_indices (values ):
1570
1627
if values .dtype != np .object_ :
0 commit comments