@@ -85,7 +85,8 @@ class GroupBy(object):
85
85
"""
86
86
87
87
def __init__ (self , obj , grouper = None , axis = 0 , level = None ,
88
- groupings = None , exclusions = None , column = None , as_index = True ):
88
+ groupings = None , exclusions = None , column = None , as_index = True ,
89
+ sort = True ):
89
90
self ._column = column
90
91
91
92
if isinstance (obj , NDFrame ):
@@ -105,10 +106,11 @@ def __init__(self, obj, grouper=None, axis=0, level=None,
105
106
106
107
self .as_index = as_index
107
108
self .grouper = grouper
109
+ self .sort = sort
108
110
109
111
if groupings is None :
110
112
groupings , exclusions = _get_groupings (obj , grouper , axis = axis ,
111
- level = level )
113
+ level = level , sort = sort )
112
114
113
115
self .groupings = groupings
114
116
self .exclusions = set (exclusions ) if exclusions else set ()
@@ -132,6 +134,7 @@ def indices(self):
132
134
if len (self .groupings ) == 1 :
133
135
return self .primary .indices
134
136
else :
137
+ # TODO: this is massively inefficient
135
138
to_groupby = zip (* (ping .grouper for ping in self .groupings ))
136
139
to_groupby = Index (to_groupby )
137
140
return lib .groupby_indices (to_groupby )
@@ -149,7 +152,7 @@ def _obj_with_exclusions(self):
149
152
150
153
@property
def _group_shape(self):
    """
    Number of groups in each grouping, as a tuple.

    One entry per element of ``self.groupings``, in the same order, each
    taken from that grouping's ``ngroups`` attribute.
    """
    return tuple(ping.ngroups for ping in self.groupings)
153
156
154
157
def __getattr__ (self , attr ):
155
158
if hasattr (self .obj , attr ):
@@ -525,11 +528,13 @@ class Grouping(object):
525
528
* group_index : unique groups
526
529
* groups : dict of {group -> label_list}
527
530
"""
528
- def __init__ (self , index , grouper = None , name = None , level = None ):
531
+ def __init__ (self , index , grouper = None , name = None , level = None ,
532
+ sort = True ):
529
533
self .name = name
530
534
self .level = level
531
535
self .grouper = _convert_grouper (index , grouper )
532
536
self .index = index
537
+ self .sort = sort
533
538
534
539
# right place for this?
535
540
if isinstance (grouper , Series ) and name is None :
@@ -576,6 +581,10 @@ def __iter__(self):
576
581
_counts = None
577
582
_group_index = None
578
583
584
@property
def ngroups(self):
    """Number of entries in ``self.group_index``."""
    return len(self.group_index)
587
+
579
588
@cache_readonly
580
589
def indices (self ):
581
590
return _groupby_indices (self .grouper )
@@ -589,38 +598,58 @@ def labels(self):
589
598
@property
def ids(self):
    """
    Mapping of integer position -> group key.

    Built on first access from ``self.group_index`` (position ``i`` maps to
    ``group_index[i]``) and cached in ``self._ids`` for later calls.
    """
    if self._ids is None:
        self._ids = dict(enumerate(self.group_index))
    return self._ids
598
604
599
605
@property
def counts(self):
    """
    Per-group counts, computed on first access and cached in ``_counts``.

    When ``self._was_factor`` is set, the counts come from
    ``lib.group_count(self.labels, self.ngroups)``; otherwise
    ``self._make_labels()`` is called, which is expected to fill
    ``self._counts`` as a side effect.
    """
    if self._counts is None:
        if self._was_factor:
            self._counts = lib.group_count(self.labels, self.ngroups)
        else:
            self._make_labels()
    return self._counts
604
613
605
614
@property
def group_index(self):
    """
    Index of unique group keys, computed on first access.

    If ``self._group_index`` has not been set yet, ``self._make_labels()``
    is called, which is expected to populate it as a side effect.
    """
    if self._group_index is None:
        self._make_labels()
    return self._group_index
613
624
614
625
def _make_labels(self):
    """
    Factorize ``self.grouper`` and cache the resulting group metadata.

    Sets three attributes on the instance:

    * ``_labels``      : integer label for each value of the grouper
    * ``_group_index`` : Index of the unique keys (named ``self.name``)
    * ``_counts``      : per-group counts returned by the factorizer

    When ``self.sort`` is true and at least one group exists, the unique
    keys are sorted and labels / counts are rearranged to match.

    Raises
    ------
    Exception
        If ``self._was_factor`` is set; this method must not be used in
        that case.
    """
    if self._was_factor:  # pragma: no cover
        raise Exception('Should not call this method grouping by level')

    values = self.grouper
    if values.dtype != np.object_:
        values = values.astype('O')

    # khash
    rizer = lib.Factorizer(len(values))
    labels, counts = rizer.factorize(values, sort=False)

    uniques = Index(rizer.uniques, name=self.name)
    if self.sort and len(counts) > 0:
        # sorter[i] is the current position of the key that belongs at
        # sorted position i; reverse_indexer maps current -> sorted position
        sorter = uniques.argsort()
        reverse_indexer = np.empty(len(sorter), dtype=np.int32)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # take() would wrap negative labels around to the end of
        # reverse_indexer, so remember them and reset them to -1 afterwards
        # (negative labels presumably mark NA values -- TODO confirm
        # against lib.Factorizer)
        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)
        counts = counts.take(sorter)

    self._labels = labels
    self._group_index = uniques
    self._counts = counts
624
653
625
654
_groups = None
626
655
@property
@@ -629,7 +658,8 @@ def groups(self):
629
658
self ._groups = self .index .groupby (self .grouper )
630
659
return self ._groups
631
660
632
- def _get_groupings (obj , grouper = None , axis = 0 , level = None ):
661
+
662
+ def _get_groupings (obj , grouper = None , axis = 0 , level = None , sort = True ):
633
663
group_axis = obj ._get_axis (axis )
634
664
635
665
if level is not None and not isinstance (group_axis , MultiIndex ):
@@ -655,7 +685,7 @@ def _get_groupings(obj, grouper=None, axis=0, level=None):
655
685
exclusions .append (gpr )
656
686
name = gpr
657
687
gpr = obj [gpr ]
658
- ping = Grouping (group_axis , gpr , name = name , level = level )
688
+ ping = Grouping (group_axis , gpr , name = name , level = level , sort = sort )
659
689
if ping .name is None :
660
690
ping .name = 'key_%d' % i
661
691
groupings .append (ping )
@@ -785,7 +815,7 @@ def _get_index():
785
815
index = MultiIndex .from_tuples (keys , names = key_names )
786
816
else :
787
817
ping = self .groupings [0 ]
788
- if len (keys ) == len ( ping .counts ) :
818
+ if len (keys ) == ping .ngroups :
789
819
index = ping .group_index
790
820
index .name = key_names [0 ]
791
821
else :
@@ -1056,7 +1086,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
1056
1086
key_index = MultiIndex .from_tuples (keys , names = key_names )
1057
1087
else :
1058
1088
ping = self .groupings [0 ]
1059
- if len (keys ) == len ( ping .counts ) :
1089
+ if len (keys ) == ping .ngroups :
1060
1090
key_index = ping .group_index
1061
1091
key_index .name = key_names [0 ]
1062
1092
@@ -1235,6 +1265,9 @@ def slicer(data, slob):
1235
1265
yield i , slicer (sorted_data , slice (start , end ))
1236
1266
1237
1267
def get_group_index (label_list , shape ):
1268
+ if len (label_list ) == 1 :
1269
+ return label_list [0 ]
1270
+
1238
1271
n = len (label_list [0 ])
1239
1272
group_index = np .zeros (n , dtype = int )
1240
1273
mask = np .zeros (n , dtype = bool )
@@ -1353,11 +1386,6 @@ def _groupby_indices(values):
1353
1386
values = values .astype ('O' )
1354
1387
return lib .groupby_indices (values )
1355
1388
1356
- def _group_labels (values ):
1357
- if values .dtype != np .object_ :
1358
- values = values .astype ('O' )
1359
- return lib .group_labels (values )
1360
-
1361
1389
def _ensure_platform_int (labels ):
1362
1390
if labels .dtype != np .int_ :
1363
1391
labels = labels .astype (np .int_ )
@@ -1367,25 +1395,3 @@ def _ensure_int64(labels):
1367
1395
if labels .dtype != np .int64 :
1368
1396
labels = labels .astype (np .int64 )
1369
1397
return labels
1370
-
1371
- def sort_group_labels (ids , labels , counts ):
1372
- n = len (ids )
1373
-
1374
- # corner all NA case
1375
- if n == 0 :
1376
- return ids , labels , counts
1377
-
1378
- rng = np .arange (n )
1379
- values = Series (ids , index = rng , dtype = object ).values
1380
- indexer = values .argsort ()
1381
-
1382
- reverse_indexer = np .empty (n , dtype = np .int32 )
1383
- reverse_indexer .put (indexer , np .arange (n ))
1384
-
1385
- new_labels = reverse_indexer .take (labels )
1386
- np .putmask (new_labels , labels == - 1 , - 1 )
1387
-
1388
- new_ids = dict (izip (rng , values .take (indexer )))
1389
- new_counts = counts .take (indexer )
1390
-
1391
- return new_ids , new_labels , new_counts
0 commit comments