Commit 4c756e2 (parent 432c672)

ENH: support for having duplicative indices across blocks (dtypes)
BUG: fix construction of a DataFrame with duplicative indices

5 files changed: +145 −42 lines changed
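In user-facing terms, this commit lets a DataFrame carry duplicate column labels even when the duplicates are split across blocks of different dtypes. A minimal sketch of the behavior, using the mixed-dtype construction from the test_indexing addition below (pandas 0.11.x-era API):

```python
from pandas import DataFrame

# seven columns, one label, three dtypes (int64, float64, object)
df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa'))

# repr and positional access previously could die with
# 'Cannot have duplicate column names split across dtypes'
str(df)
```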

RELEASE.rst (+6)

@@ -62,6 +62,12 @@ pandas 0.11.1
     to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
   - Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
+  - ref_locs support to allow duplicative indices across dtypes (GH3468_)
+  - Non-unique index support clarified (GH3468_)
+
+  - Fix assigning a new index to a duplicate index in a DataFrame would fail
+  - Fix construction of a DataFrame with a duplicate index
+  - ref_locs support to allow duplicative indices across dtypes
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
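For reference, the GH3468 failure mode named above is plain relabeling of a duplicate column index, as exercised in test_assign_columns_with_dups below:

```python
from pandas import DataFrame

df = DataFrame([[1, 2]], columns=['a', 'a'])  # duplicate column index
df.columns = ['a', 'a.1']                     # previously raised; now relabels in place
```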

pandas/core/internals.py (+99 −39)

@@ -56,15 +56,11 @@ def _gi(self, arg):
     @property
     def ref_locs(self):
         if self._ref_locs is None:
-            ri = self.ref_items
-            if ri.is_unique:
-                indexer = ri.get_indexer(self.items)
-                indexer = com._ensure_platform_int(indexer)
-                if (indexer == -1).any():
-                    raise AssertionError('Some block items were not in block '
-                                         'ref_items')
-            else:
-                indexer = np.arange(len(ri))
+            indexer = self.ref_items.get_indexer(self.items)
+            indexer = com._ensure_platform_int(indexer)
+            if (indexer == -1).any():
+                raise AssertionError('Some block items were not in block '
+                                     'ref_items')

             self._ref_locs = indexer
         return self._ref_locs
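The simplified property leans entirely on Index.get_indexer, which maps each block item to its position in ref_items, with -1 marking a miss. A small illustration of that call (plain pandas, no internals):

```python
from pandas import Index

ref_items = Index(['a', 'b', 'c', 'd'])   # the manager's full axis
block_items = Index(['b', 'd'])           # the labels one block carries

indexer = ref_items.get_indexer(block_items)
print(indexer)                 # [1 3]: position of each block item in ref_items
print((indexer == -1).any())   # False; True would mean a stray block item
```

get_indexer requires a unique index, which is presumably why the non-unique path below (set_ref_locs) assigns each block's _ref_locs directly, so this property's computation is never reached for duplicated axes.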
@@ -884,7 +880,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']

     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]

@@ -920,11 +916,83 @@ def set_axis(self, axis, value):
         if len(value) != len(cur_axis):
             raise Exception('Length mismatch (%d vs %d)'
                             % (len(value), len(cur_axis)))
+
         self.axes[axis] = value

         if axis == 0:
-            for block in self.blocks:
-                block.set_ref_items(self.items, maybe_rename=True)
+
+            # unique, we can take
+            if cur_axis.is_unique:
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+            # compute a duplicate indexer that we can use to take
+            # the new items from ref_items (in place of _ref_items)
+            else:
+                self.set_ref_locs(cur_axis)
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+    def set_ref_locs(self, labels=None):
+        # if we have a non-unique index on this axis, set the indexers
+        # we need to set an absolute indexer for the blocks
+        # return the indexer if we are not unique
+        if labels is None:
+            labels = self.items
+
+        if labels.is_unique:
+            return None
+
+        #### THIS IS POTENTIALLY VERY SLOW #####
+
+        # if we are already computed, then we are done
+        if getattr(self, '_ref_locs', None) is not None:
+            return self._ref_locs
+
+        blocks = self.blocks
+
+        # initialize
+        blockmap = dict()
+        for b in blocks:
+            arr = np.empty(len(b.items), dtype='int64')
+            arr.fill(-1)
+            b._ref_locs = arr
+
+            # add this block to the blockmap for each
+            # of the items in the block
+            for item in b.items:
+                if item not in blockmap:
+                    blockmap[item] = []
+                blockmap[item].append(b)
+
+        rl = np.empty(len(labels), dtype=object)
+        for i, item in enumerate(labels.values):
+
+            try:
+                block = blockmap[item].pop(0)
+            except:
+                raise Exception("not enough items in set_ref_locs")
+
+            indexer = np.arange(len(block.items))
+            mask = (block.items == item) & (block._ref_locs == -1)
+            if not mask.any():
+
+                # this case will catch a comparison of an index of tuples
+                mask = np.empty(len(block.items), dtype=bool)
+                mask.fill(False)
+                for j, (bitem, brl) in enumerate(zip(block.items, block._ref_locs)):
+                    mask[j] = bitem == item and brl == -1
+
+            indices = indexer[mask]
+            if len(indices):
+                idx = indices[0]
+            else:
+                raise Exception("already set too many items in set_ref_locs")
+
+            block._ref_locs[idx] = i
+            rl[i] = (block, idx)
+
+        self._ref_locs = rl
+        return rl

     # make items read only for now
     def _get_items(self):
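Stripped to its essentials, set_ref_locs is a greedy first-fit assignment: walk the (possibly duplicated) labels in axis order and hand each occurrence the next unclaimed matching slot, from the next block that carries that label. A self-contained sketch of just that assignment logic; FakeBlock and assign_ref_locs are hypothetical stand-ins, not pandas API:

```python
import numpy as np

class FakeBlock:
    """Stand-in for an internals block: labels plus a slot map initialized to -1."""
    def __init__(self, items):
        self.items = np.asarray(items, dtype=object)
        self.ref_locs = np.full(len(items), -1, dtype='int64')

def assign_ref_locs(labels, blocks):
    # queue of candidate blocks per label; a block is queued once per occurrence
    blockmap = {}
    for b in blocks:
        for item in b.items:
            blockmap.setdefault(item, []).append(b)

    rl = np.empty(len(labels), dtype=object)
    for i, item in enumerate(labels):
        block = blockmap[item].pop(0)                        # next holder of this label
        mask = (block.items == item) & (block.ref_locs == -1)
        idx = np.flatnonzero(mask)[0]                        # first unclaimed slot
        block.ref_locs[idx] = i                              # absolute axis position
        rl[i] = (block, idx)
    return rl

ints = FakeBlock(['a', 'a'])   # say, an int64 block
objs = FakeBlock(['a', 'b'])   # say, an object block
rl = assign_ref_locs(['a', 'a', 'a', 'b'], [ints, objs])
# labels 0 and 1 land in the int block's two 'a' slots, label 2 in the
# object block's 'a' slot, and label 3 in its 'b' slot
```

The (block, slot) pairs in rl are exactly what iget consumes below; the error branches in the real method fire when the labels ask for more (or fewer) occurrences of an item than the blocks collectively hold.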
@@ -1392,26 +1460,11 @@ def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
             return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()

-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]

     def get_scalar(self, tup):
         """
@@ -1587,6 +1640,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # keep track of what items aren't found anywhere
         mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]
+
         new_blocks = []
         for blk in self.blocks:
             blk_indexer = blk.items.get_indexer(item_order)

@@ -1610,7 +1665,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
             new_blocks.append(na_block)
         new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def reindex_items(self, new_items, copy=True, fill_value=np.nan):
         """
@@ -1624,6 +1679,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):

         # TODO: this part could be faster (!)
         new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

         # could have some pathological (MultiIndex) issues here
         new_blocks = []
@@ -1648,7 +1704,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
             new_blocks.append(na_block)
         new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def _make_na_block(self, items, ref_items, fill_value=np.nan):
         # TODO: infer dtypes other than float64 from fill_value

@@ -1690,11 +1746,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

         cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
-
         new_axes = list(this.axes)
         new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
+
         return BlockManager(consolidated, new_axes)

     def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):

@@ -1907,7 +1963,6 @@ def form_blocks(arrays, names, axes):

         na_block = make_block(block_values, extra_items, items)
         blocks.append(na_block)
-        blocks = _consolidate(blocks, items)

     return blocks

@@ -1958,16 +2013,21 @@ def _shape_compat(x):

     names, arrays = zip(*tuples)

-    # index may box values
-    items = ref_items[ref_items.isin(names)]
-
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)

     stacked = np.empty(shape, dtype=dtype)
     for i, arr in enumerate(arrays):
         stacked[i] = _asarray_compat(arr)

+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([n for n in names if n in ref_items])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")
+
     return items, stacked

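The fallback branch exists because boolean isin-selection on a non-unique ref_items picks up every matching entry, which need not line up one-to-one with the stacked arrays; filtering the incoming names instead keeps the count honest, and the length check catches any remaining mismatch. An illustration of the selection difference (plain Index operations; the shapes are hypothetical):

```python
from pandas import Index

ref_items = Index(['a', 'a', 'b'])   # non-unique ref_items
names = ['a', 'b']                   # one stacked array per name

# the isin route selects *every* matching ref_items entry: ['a', 'a', 'b'],
# three labels for two stacked rows
print(ref_items[ref_items.isin(names)])

# the fallback keeps the incoming names, in order: ['a', 'b'], so the
# length matches the number of stacked arrays
print(Index([n for n in names if n in ref_items]))
```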
pandas/tests/test_frame.py (+32 −2)

@@ -9204,18 +9204,48 @@ def test_assign_columns(self):
     def test_assign_columns_with_dups(self):

         # GH 3468 related
+
+        # basic
         df = DataFrame([[1,2]], columns=['a','a'])
         df.columns = ['a','a.1']
-
+        str(df)
         expected = DataFrame([[1,2]], columns=['a','a.1'])
         assert_frame_equal(df, expected)

+        df = DataFrame([[1,2,3]], columns=['b','a','a'])
+        df.columns = ['b','a','a.1']
+        str(df)
+        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
+        assert_frame_equal(df, expected)
+
+        # with a dup index
         df = DataFrame([[1,2]], columns=['a','a'])
         df.columns = ['b','b']
-
+        str(df)
         expected = DataFrame([[1,2]], columns=['b','b'])
         assert_frame_equal(df, expected)

+        # multi-dtype
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
+        df.columns = list('ABCDEFG')
+        str(df)
+        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
+        assert_frame_equal(df, expected)
+
+        # this is an error because we cannot disambiguate the dup columns
+        self.assertRaises(Exception, lambda x: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))
+
+        # dups across blocks
+        df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
+        df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
+        df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
+        df_object = DataFrame('foo', index=df_float.index, columns=df_float.columns)
+        df_dt = DataFrame(Timestamp('20010101'), index=df_float.index, columns=df_float.columns)
+        df = pan.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+        result = df._data.set_ref_locs()
+        self.assert_(len(result) == len(df.columns))
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)

pandas/tests/test_indexing.py (+7)

@@ -772,6 +772,13 @@ def test_dups_fancy_indexing(self):
         expected = Index(['b','a','a'])
         self.assert_(result.equals(expected))

+        # across dtypes
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
+        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
+        result.columns = list('aaaaaaa')
+        assert_frame_equal(df, result)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tests/test_internals.py (+1 −1)

@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
             b.ref_items = items

         mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)

     def test_contains(self):
         self.assert_('a' in self.mgr)
