@@ -1367,30 +1367,16 @@ def group_info(self):
1367
1367
1368
1368
def _get_compressed_labels (self ):
1369
1369
all_labels = [ping .labels for ping in self .groupings ]
1370
- if self ._overflow_possible :
1371
- tups = lib .fast_zip (all_labels )
1372
- labs , uniques = algos .factorize (tups )
1370
+ if len (all_labels ) > 1 :
1371
+ group_index = get_group_index (all_labels , self .shape ,
1372
+ sort = True , xnull = True )
1373
+ return _compress_group_index (group_index )
1373
1374
1374
- if self .sort :
1375
- uniques , labs = _reorder_by_uniques (uniques , labs )
1375
+ ping = self .groupings [0 ]
1376
+ self .compressed = False
1377
+ self ._filter_empty_groups = False
1376
1378
1377
- return labs , uniques
1378
- else :
1379
- if len (all_labels ) > 1 :
1380
- group_index = get_group_index (all_labels , self .shape )
1381
- comp_ids , obs_group_ids = _compress_group_index (group_index )
1382
- else :
1383
- ping = self .groupings [0 ]
1384
- comp_ids = ping .labels
1385
- obs_group_ids = np .arange (len (ping .group_index ))
1386
- self .compressed = False
1387
- self ._filter_empty_groups = False
1388
-
1389
- return comp_ids , obs_group_ids
1390
-
1391
- @cache_readonly
1392
- def _overflow_possible (self ):
1393
- return _int64_overflow_possible (self .shape )
1379
+ return ping .labels , np .arange (len (ping .group_index ))
1394
1380
1395
1381
@cache_readonly
1396
1382
def ngroups (self ):
@@ -1402,15 +1388,13 @@ def result_index(self):
1402
1388
return MultiIndex .from_arrays (recons , names = self .names )
1403
1389
1404
1390
def get_group_levels (self ):
1405
- obs_ids = self .group_info [ 1 ]
1391
+ comp_ids , obs_ids , _ = self .group_info
1406
1392
1407
1393
if not self .compressed and len (self .groupings ) == 1 :
1408
1394
return [self .groupings [0 ].group_index ]
1409
1395
1410
- if self ._overflow_possible :
1411
- recons_labels = [np .array (x ) for x in zip (* obs_ids )]
1412
- else :
1413
- recons_labels = decons_group_index (obs_ids , self .shape )
1396
+ recons_labels = decons_obs_group_ids (comp_ids , obs_ids ,
1397
+ self .shape , (ping .labels for ping in self .groupings ))
1414
1398
1415
1399
name_list = []
1416
1400
for ping , labels in zip (self .groupings , recons_labels ):
@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
3490
3474
# Misc utilities
3491
3475
3492
3476
3493
- def get_group_index (label_list , shape ):
3477
+ def get_group_index (labels , shape , sort , xnull ):
3494
3478
"""
3495
3479
For the given `labels`, gets the offsets into the hypothetical list
3496
3480
representing the totally ordered cartesian product of all possible label
3497
- combinations.
3498
- """
3499
- if len (label_list ) == 1 :
3500
- return label_list [0 ]
3501
-
3502
- n = len (label_list [0 ])
3503
- group_index = np .zeros (n , dtype = np .int64 )
3504
- mask = np .zeros (n , dtype = bool )
3505
- for i in range (len (shape )):
3506
- stride = np .prod ([x for x in shape [i + 1 :]], dtype = np .int64 )
3507
- group_index += com ._ensure_int64 (label_list [i ]) * stride
3508
- mask |= label_list [i ] < 0
3509
-
3510
- np .putmask (group_index , mask , - 1 )
3511
- return group_index
3512
-
3513
-
3514
- def get_flat_ids (labels , shape , retain_lex_rank ):
3515
- """
3516
- Given a list of labels at each level, returns a flat array of int64 ids
3517
- corresponding to unique tuples across the labels. If `retain_lex_rank`,
3518
- rank of returned ids preserve lexical ranks of labels.
3481
+ combinations, *as long as* this space fits within int64 bounds;
3482
+ otherwise, though group indices identify unique combinations of
3483
+ labels, they cannot be deconstructed.
3484
+ - If `sort`, rank of returned ids preserve lexical ranks of labels.
3485
+ i.e. returned id's can be used to do lexical sort on labels;
3486
+ - If `xnull` nulls (-1 labels) are passed through.
3519
3487
3520
3488
Parameters
3521
3489
----------
3522
3490
labels: sequence of arrays
3523
3491
Integers identifying levels at each location
3524
3492
shape: sequence of ints same length as labels
3525
3493
Number of unique levels at each location
3526
- retain_lex_rank : boolean
3494
+ sort : boolean
3527
3495
If the ranks of returned ids should match lexical ranks of labels
3528
-
3496
+ xnull: boolean
3497
+ If true nulls are eXcluded. i.e. -1 values in the labels are
3498
+ passed through
3529
3499
Returns
3530
3500
-------
3531
3501
An array of type int64 where two elements are equal if their corresponding
@@ -3544,12 +3514,18 @@ def loop(labels, shape):
3544
3514
stride //= shape [i ]
3545
3515
out += labels [i ] * stride
3546
3516
3517
+ if xnull : # exclude nulls
3518
+ mask = labels [0 ] == - 1
3519
+ for lab in labels [1 :nlev ]:
3520
+ mask |= lab == - 1
3521
+ out [mask ] = - 1
3522
+
3547
3523
if nlev == len (shape ): # all levels done!
3548
3524
return out
3549
3525
3550
3526
# compress what has been done so far in order to avoid overflow
3551
3527
# to retain lexical ranks, obs_ids should be sorted
3552
- comp_ids , obs_ids = _compress_group_index (out , sort = retain_lex_rank )
3528
+ comp_ids , obs_ids = _compress_group_index (out , sort = sort )
3553
3529
3554
3530
labels = [comp_ids ] + labels [nlev :]
3555
3531
shape = [len (obs_ids )] + shape [nlev :]
@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size): # promote nan values
3560
3536
return (lab + 1 , size + 1 ) if (lab == - 1 ).any () else (lab , size )
3561
3537
3562
3538
labels = map (com ._ensure_int64 , labels )
3563
- labels , shape = map (list , zip (* map (maybe_lift , labels , shape )))
3539
+ if not xnull :
3540
+ labels , shape = map (list , zip (* map (maybe_lift , labels , shape )))
3564
3541
3565
- return loop (labels , shape )
3542
+ return loop (list ( labels ), list ( shape ) )
3566
3543
3567
3544
3568
3545
_INT64_MAX = np .iinfo (np .int64 ).max
@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):
3578
3555
3579
3556
def decons_group_index (comp_labels , shape ):
3580
3557
# reconstruct labels
3558
+ if _int64_overflow_possible (shape ):
3559
+ # at some point group indices are factorized,
3560
+ # and may not be deconstructed here! wrong path!
3561
+ raise ValueError ('cannot deconstruct factorized group indices!' )
3562
+
3581
3563
label_list = []
3582
3564
factor = 1
3583
3565
y = 0
@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
3591
3573
return label_list [::- 1 ]
3592
3574
3593
3575
3576
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
    """Reconstruct labels from observed ids.

    Parameters
    ----------
    comp_ids : compressed group ids; only consulted on the overflow path
    obs_ids : observed group ids; only consulted on the fast path
    shape : sequence of ints, passed to the overflow check and to
        ``decons_group_index``
    labels : iterable of label arrays, one per level; iterated once and
        only on the overflow path

    Returns
    -------
    The result of ``decons_group_index(obs_ids, shape)`` when `shape`
    cannot overflow int64; otherwise a list holding, for each array in
    `labels`, a fresh int64 copy of the entries selected by
    ``unique_label_indices(comp_ids)``.
    """
    from pandas.hashtable import unique_label_indices

    if _int64_overflow_possible(shape):
        # The observed ids cannot be split back arithmetically here, so
        # read the labels at the positions reported for comp_ids instead.
        # NOTE(review): presumably one position per distinct id -- confirm
        # against unique_label_indices.
        picks = unique_label_indices(comp_ids)
        return [lab[picks].astype('i8', subok=False, copy=True)
                for lab in labels]

    # obs ids are deconstructable! take the fast route!
    return decons_group_index(obs_ids, shape)
3587
+
3588
+
3594
3589
def _indexer_from_factorized (labels , shape , compress = True ):
3595
3590
if _int64_overflow_possible (shape ):
3596
3591
indexer = np .lexsort (np .array (labels [::- 1 ]))
3597
3592
return indexer
3598
3593
3599
- group_index = get_group_index (labels , shape )
3594
+ group_index = get_group_index (labels , shape , sort = True , xnull = True )
3600
3595
3601
3596
if compress :
3602
3597
comp_ids , obs_ids = _compress_group_index (group_index )
@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):
3712
3707
3713
3708
def _get_indices_dict (label_list , keys ):
3714
3709
shape = list (map (len , keys ))
3715
- ngroups = np .prod (shape )
3716
3710
3717
- group_index = get_group_index (label_list , shape )
3711
+ group_index = get_group_index (label_list , shape , sort = True , xnull = True )
3712
+ ngroups = ((group_index .size and group_index .max ()) + 1 ) \
3713
+ if _int64_overflow_possible (shape ) \
3714
+ else np .prod (shape , dtype = 'i8' )
3715
+
3718
3716
sorter = _get_group_index_sorter (group_index , ngroups )
3719
3717
3720
3718
sorted_labels = [lab .take (sorter ) for lab in label_list ]
0 commit comments