@@ -56,15 +56,11 @@ def _gi(self, arg):
     @property
     def ref_locs(self):
         if self._ref_locs is None:
-            ri = self.ref_items
-            if ri.is_unique:
-                indexer = ri.get_indexer(self.items)
-                indexer = com._ensure_platform_int(indexer)
-                if (indexer == -1).any():
-                    raise AssertionError('Some block items were not in block '
-                                         'ref_items')
-            else:
-                indexer = np.arange(len(ri))
+            indexer = self.ref_items.get_indexer(self.items)
+            indexer = com._ensure_platform_int(indexer)
+            if (indexer == -1).any():
+                raise AssertionError('Some block items were not in block '
+                                     'ref_items')

             self._ref_locs = indexer
         return self._ref_locs
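For orientation: `Index.get_indexer` returns, for each block item, its integer position inside `ref_items` (or -1 for anything missing), which is what lets the rewritten property drop the `is_unique` branch and simply assert that nothing came back as -1. A minimal standalone sketch of that behaviour, with made-up labels rather than anything from the patch:

    from pandas import Index

    ref_items = Index(['a', 'b', 'c', 'd'])   # manager-level items (unique here)
    block_items = Index(['b', 'd'])           # the items one block owns

    indexer = ref_items.get_indexer(block_items)
    print(indexer)                            # [1 3]
    assert not (indexer == -1).any()          # every block item was found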
@@ -884,7 +880,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']

     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]
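The new `'_ref_locs'` entry is needed because `BlockManager` defines `__slots__` (and so has no instance `__dict__`): assigning an attribute that is not declared in `__slots__` raises `AttributeError`. A toy illustration, unrelated to the pandas classes:

    class WithSlots(object):
        __slots__ = ['a']

    obj = WithSlots()
    obj.a = 1          # declared in __slots__, fine
    try:
        obj.b = 2      # not declared -> AttributeError
    except AttributeError as err:
        print(err)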
@@ -920,11 +916,83 @@ def set_axis(self, axis, value):
         if len(value) != len(cur_axis):
             raise Exception('Length mismatch (%d vs %d)'
                             % (len(value), len(cur_axis)))
+
         self.axes[axis] = value

         if axis == 0:
-            for block in self.blocks:
-                block.set_ref_items(self.items, maybe_rename=True)
+            # unique, we can take
+            if cur_axis.is_unique:
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+            # compute a duplicate indexer that we can use to take
+            # the new items from ref_items (in place of _ref_items)
+            else:
+                self.set_ref_locs(cur_axis)
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+    def set_ref_locs(self, labels=None):
+        # if we have a non-unique index on this axis, set the indexers
+        # we need to set an absolute indexer for the blocks
+        # return the indexer if we are not unique
+        if labels is None:
+            labels = self.items
+
+        if labels.is_unique:
+            return None
+
+        #### THIS IS POTENTIALLY VERY SLOW #####
+
+        # if we are already computed, then we are done
+        if getattr(self, '_ref_locs', None) is not None:
+            return self._ref_locs
+
+        blocks = self.blocks
+
+        # initialize
+        blockmap = dict()
+        for b in blocks:
+            arr = np.empty(len(b.items), dtype='int64')
+            arr.fill(-1)
+            b._ref_locs = arr
+
+            # add this block to the blockmap for each
+            # of the items in the block
+            for item in b.items:
+                if item not in blockmap:
+                    blockmap[item] = []
+                blockmap[item].append(b)
+
+        rl = np.empty(len(labels), dtype=object)
+        for i, item in enumerate(labels.values):
+
+            try:
+                block = blockmap[item].pop(0)
+            except:
+                raise Exception("not enough items in set_ref_locs")
+
+            indexer = np.arange(len(block.items))
+            mask = (block.items == item) & (block._ref_locs == -1)
+            if not mask.any():
+
+                # this case will catch a comparison of an index of tuples
+                mask = np.empty(len(block.items), dtype=bool)
+                mask.fill(False)
+                for j, (bitem, brl) in enumerate(zip(block.items, block._ref_locs)):
+                    mask[j] = bitem == item and brl == -1
+
+            indices = indexer[mask]
+            if len(indices):
+                idx = indices[0]
+            else:
+                raise Exception("already set too many items in set_ref_locs")
+
+            block._ref_locs[idx] = i
+            rl[i] = (block, idx)
+
+        self._ref_locs = rl
+        return rl

     # make items read only for now
     def _get_items(self):
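In plain terms, `set_ref_locs` caches, for a non-unique items axis, one `(block, slot-within-block)` pair per positional label, consuming matching block slots left to right so that two columns with the same name (possibly stored in different dtype blocks) resolve to distinct storage. Below is a small self-contained model of that mapping, written against made-up `(dtype-name, item-list)` pairs rather than real `Block` objects:

    import numpy as np

    # "blocks" here are just (dtype-name, list-of-items) pairs, not pandas Blocks.
    blocks = [('float64', ['a', 'b']),
              ('int64',   ['a'])]
    labels = ['a', 'a', 'b']                  # the non-unique items axis

    # every block slot starts unclaimed (-1), mirroring block._ref_locs
    slot_owner = {name: np.full(len(items), -1, dtype='int64')
                  for name, items in blocks}

    # map each label to the blocks that carry it, in block order
    blockmap = {}
    for name, items in blocks:
        for item in items:
            blockmap.setdefault(item, []).append((name, items))

    ref_locs = np.empty(len(labels), dtype=object)
    for i, label in enumerate(labels):
        name, items = blockmap[label].pop(0)              # next block holding label
        mask = (np.array(items) == label) & (slot_owner[name] == -1)
        idx = np.flatnonzero(mask)[0]                     # first unclaimed matching slot
        slot_owner[name][idx] = i
        ref_locs[i] = (name, idx)

    print(list(ref_locs))   # [('float64', 0), ('int64', 0), ('float64', 1)]

The real method stores the block object itself in place of the dtype name and raises when the bookkeeping runs out of unclaimed slots, but the mapping it produces has the same shape.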
@@ -1392,26 +1460,11 @@ def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
             return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()

-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]

     def get_scalar(self, tup):
         """
@@ -1587,6 +1640,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # keep track of what items aren't found anywhere
         mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]
+
         new_blocks = []
         for blk in self.blocks:
             blk_indexer = blk.items.get_indexer(item_order)
@@ -1610,7 +1665,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
             new_blocks.append(na_block)
         new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def reindex_items(self, new_items, copy=True, fill_value=np.nan):
         """
@@ -1624,6 +1679,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):

         # TODO: this part could be faster (!)
         new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

         # could have some pathological (MultiIndex) issues here
         new_blocks = []
@@ -1648,7 +1704,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
             new_blocks.append(na_block)
         new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def _make_na_block(self, items, ref_items, fill_value=np.nan):
         # TODO: infer dtypes other than float64 from fill_value
@@ -1690,11 +1746,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

         cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
-
         new_axes = list(this.axes)
         new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
+
         return BlockManager(consolidated, new_axes)

     def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
@@ -1907,7 +1963,6 @@ def form_blocks(arrays, names, axes):

         na_block = make_block(block_values, extra_items, items)
         blocks.append(na_block)
-        blocks = _consolidate(blocks, items)

     return blocks

@@ -1958,16 +2013,21 @@ def _shape_compat(x):

     names, arrays = zip(*tuples)

-    # index may box values
-    items = ref_items[ref_items.isin(names)]
-
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)

     stacked = np.empty(shape, dtype=dtype)
     for i, arr in enumerate(arrays):
         stacked[i] = _asarray_compat(arr)

+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([n for n in names if n in ref_items])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")
+
     return items, stacked

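The reason for the new branch: when `ref_items` itself contains duplicates, filtering it with `isin(names)` can return more labels than there are stacked arrays, so the non-unique path builds `items` from `names` directly and then checks the lengths. A standalone illustration of the mismatch, with invented labels:

    import pandas as pd

    ref_items = pd.Index(['a', 'a', 'b'])    # non-unique reference items
    names = ['a', 'b']                       # one stacked array per name

    print(list(ref_items[ref_items.isin(names)]))   # ['a', 'a', 'b'] -- 3 labels for 2 arrays
    print([n for n in names if n in ref_items])     # ['a', 'b']      -- lengths line up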