Skip to content

Commit 38a87b6

Browse files
committed
Merge pull request #4772 from jreback/concat_dups
BUG: Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (GH4771)
2 parents c1ab38e + 75d378b commit 38a87b6

File tree

5 files changed

+106
-13
lines changed

5 files changed

+106
-13
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
331331
- Bug in multi-indexing with a partial string selection as one part of a MultiIndex (:issue:`4758`)
332332
- Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
333333
- Bug in setting with ``loc/ix`` a single indexer with a multi-index axis and a numpy array, related to (:issue:`3777`)
334+
- Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`)
335+
- Bug in ``iloc`` with a slice index failing (:issue:`4771`)
334336

335337
pandas 0.12
336338
===========

pandas/core/internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2174,7 +2174,7 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
21742174
placement=blk._ref_locs)
21752175
new_blocks = [newb]
21762176
else:
2177-
return self.reindex_items(new_items)
2177+
return self.reindex_items(new_items, indexer=np.arange(len(self.items))[slobj])
21782178
else:
21792179
new_blocks = self._slice_blocks(slobj, axis)
21802180

pandas/tests/test_indexing.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
MultiIndex, DatetimeIndex, Timestamp)
1717
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
1818
assert_frame_equal, assert_panel_equal)
19-
from pandas import compat
19+
from pandas import compat, concat
2020

2121
import pandas.util.testing as tm
2222
import pandas.lib as lib
@@ -359,6 +359,29 @@ def test_iloc_getitem_slice(self):
359359
self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'])
360360
self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError)
361361

362+
def test_iloc_getitem_slice_dups(self):
363+
364+
df1 = DataFrame(np.random.randn(10,4),columns=['A','A','B','B'])
365+
df2 = DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])
366+
367+
# axis=1
368+
df = concat([df1,df2],axis=1)
369+
assert_frame_equal(df.iloc[:,:4],df1)
370+
assert_frame_equal(df.iloc[:,4:],df2)
371+
372+
df = concat([df2,df1],axis=1)
373+
assert_frame_equal(df.iloc[:,:2],df2)
374+
assert_frame_equal(df.iloc[:,2:],df1)
375+
376+
assert_frame_equal(df.iloc[:,0:3],concat([df2,df1.iloc[:,[0]]],axis=1))
377+
378+
# axis=0
379+
df = concat([df,df],axis=0)
380+
assert_frame_equal(df.iloc[0:10,:2],df2)
381+
assert_frame_equal(df.iloc[0:10,2:],df1)
382+
assert_frame_equal(df.iloc[10:,:2],df2)
383+
assert_frame_equal(df.iloc[10:,2:],df1)
384+
362385
def test_iloc_getitem_out_of_bounds(self):
363386

364387
# out-of-bounds slice

pandas/tools/merge.py

+31-11
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,7 @@ def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
649649
for data, indexer in zip(data_list, indexers):
650650
if not data.is_consolidated():
651651
data = data.consolidate()
652+
data._set_ref_locs()
652653
self.units.append(_JoinUnit(data.blocks, indexer))
653654

654655
self.join_index = join_index
@@ -682,7 +683,6 @@ def get_result(self):
682683
blockmaps = self._prepare_blocks()
683684
kinds = _get_merge_block_kinds(blockmaps)
684685

685-
result_is_unique = self.result_axes[0].is_unique
686686
result_blocks = []
687687

688688
# maybe want to enable flexible copying <-- what did I mean?
@@ -692,23 +692,28 @@ def get_result(self):
692692
if klass in mapping:
693693
klass_blocks.extend((unit, b) for b in mapping[klass])
694694
res_blk = self._get_merged_block(klass_blocks)
695-
696-
# if we have a unique result index, need to clear the _ref_locs
697-
# a non-unique is set as we are creating
698-
if result_is_unique:
699-
res_blk.set_ref_locs(None)
700-
701695
result_blocks.append(res_blk)
702696

703697
return BlockManager(result_blocks, self.result_axes)
704698

705699
def _get_merged_block(self, to_merge):
706700
if len(to_merge) > 1:
701+
702+
# placement set here
707703
return self._merge_blocks(to_merge)
708704
else:
709705
unit, block = to_merge[0]
710-
return unit.reindex_block(block, self.axis,
711-
self.result_items, copy=self.copy)
706+
blk = unit.reindex_block(block, self.axis,
707+
self.result_items, copy=self.copy)
708+
709+
# set placement / invalidate on a unique result
710+
if self.result_items.is_unique and blk._ref_locs is not None:
711+
if not self.copy:
712+
blk = blk.copy()
713+
blk.set_ref_locs(None)
714+
715+
return blk
716+
712717

713718
def _merge_blocks(self, merge_chunks):
714719
"""
@@ -736,7 +741,18 @@ def _merge_blocks(self, merge_chunks):
736741

737742
# does not sort
738743
new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
739-
return make_block(out, new_block_items, self.result_items)
744+
745+
# need to set placement if we have a non-unique result
746+
# calculate by the existing placement plus the offset in the result set
747+
placement = None
748+
if not self.result_items.is_unique:
749+
nchunks = len(merge_chunks)
750+
offsets = np.array([0] + [ len(self.result_items) / nchunks ] * (nchunks-1)).cumsum()
751+
placement = []
752+
for (unit, blk), offset in zip(merge_chunks,offsets):
753+
placement.extend(blk.ref_locs+offset)
754+
755+
return make_block(out, new_block_items, self.result_items, placement=placement)
740756

741757

742758
class _JoinUnit(object):
@@ -992,6 +1008,7 @@ def _prepare_blocks(self):
9921008
blockmaps = []
9931009
for data in reindexed_data:
9941010
data = data.consolidate()
1011+
data._set_ref_locs()
9951012
blockmaps.append(data.get_block_map(typ='dict'))
9961013
return blockmaps, reindexed_data
9971014

@@ -1063,7 +1080,10 @@ def _concat_blocks(self, blocks):
10631080
# or maybe would require performance test)
10641081
raise PandasError('dtypes are not consistent throughout '
10651082
'DataFrames')
1066-
return make_block(concat_values, blocks[0].items, self.new_axes[0])
1083+
return make_block(concat_values,
1084+
blocks[0].items,
1085+
self.new_axes[0],
1086+
placement=blocks[0]._ref_locs)
10671087
else:
10681088

10691089
offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for

pandas/tools/tests/test_merge.py

+48
Original file line numberDiff line numberDiff line change
@@ -1396,6 +1396,54 @@ def test_crossed_dtypes_weird_corner(self):
13961396
[df, df2], keys=['one', 'two'], names=['first', 'second'])
13971397
self.assertEqual(result.index.names, ('first', 'second'))
13981398

1399+
def test_dups_index(self):
1400+
# GH 4771
1401+
1402+
# single dtypes
1403+
df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C'])
1404+
1405+
result = concat([df,df],axis=1)
1406+
assert_frame_equal(result.iloc[:,:4],df)
1407+
assert_frame_equal(result.iloc[:,4:],df)
1408+
1409+
result = concat([df,df],axis=0)
1410+
assert_frame_equal(result.iloc[:10],df)
1411+
assert_frame_equal(result.iloc[10:],df)
1412+
1413+
# multi dtypes
1414+
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
1415+
DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
1416+
axis=1)
1417+
1418+
result = concat([df,df],axis=1)
1419+
assert_frame_equal(result.iloc[:,:6],df)
1420+
assert_frame_equal(result.iloc[:,6:],df)
1421+
1422+
result = concat([df,df],axis=0)
1423+
assert_frame_equal(result.iloc[:10],df)
1424+
assert_frame_equal(result.iloc[10:],df)
1425+
1426+
# append
1427+
result = df.iloc[0:8,:].append(df.iloc[8:])
1428+
assert_frame_equal(result, df)
1429+
1430+
result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10])
1431+
assert_frame_equal(result, df)
1432+
1433+
expected = concat([df,df],axis=0)
1434+
result = df.append(df)
1435+
assert_frame_equal(result, expected)
1436+
1437+
def test_join_dups(self):
1438+
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
1439+
DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
1440+
axis=1)
1441+
1442+
expected = concat([df,df],axis=1)
1443+
result = df.join(df,rsuffix='_2')
1444+
result.columns = expected.columns
1445+
assert_frame_equal(result, expected)
1446+
13991447
def test_handle_empty_objects(self):
14001448
df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
14011449

0 commit comments

Comments
 (0)