
Commit e338862

BUG: concatenation with duplicate columns across dtypes not merging with axis=0 (originally GH4771), fixed again in GH4975
1 parent 0fa4d06 commit e338862

File tree

3 files changed: +38 −7 lines changed

  doc/source/release.rst
  pandas/tools/merge.py
  pandas/tools/tests/test_merge.py

doc/source/release.rst

+1 −1

@@ -432,7 +432,7 @@ Bug Fixes
   - Bug in multi-indexing with a partial string selection as one part of a MultIndex (:issue:`4758`)
   - Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
   - Bug in setting with ``loc/ix`` a single indexer with a multi-index axis and a numpy array, related to (:issue:`3777`)
-  - Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`)
+  - Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`, :issue:`4975`)
   - Bug in ``iloc`` with a slice index failing (:issue:`4771`)
   - Incorrect error message with no colspecs or width in ``read_fwf``. (:issue:`4774`)
   - Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, :issue:`4550`)
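
As context for the changed entry, a minimal sketch of the scenario it describes, assuming the same frame layout as the test added in this commit (values are arbitrary): a frame whose duplicate column labels span two dtypes should concatenate along axis=0 without mis-merging blocks.

    import numpy as np
    from pandas import DataFrame, concat

    # Duplicate 'A' labels backed by different dtypes: float64 from the first
    # piece, int64 from the second (mirrors the frame built in test_join_dups).
    df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
                 DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                           columns=['A', 'C'])],
                axis=1)

    # GH4771: stacking the frame on itself along axis=0 should simply repeat
    # the rows and keep all six (partly duplicated) column labels.
    stacked = concat([df, df], axis=0)
    assert stacked.shape == (20, 6)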

pandas/tools/merge.py

+21 −5

@@ -655,6 +655,7 @@ def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
         self.join_index = join_index
         self.axis = axis
         self.copy = copy
+        self.offsets = None
 
         # do NOT sort
         self.result_items = _concat_indexes([d.items for d in data_list])
@@ -683,14 +684,29 @@ def get_result(self):
         blockmaps = self._prepare_blocks()
         kinds = _get_merge_block_kinds(blockmaps)
 
-        result_blocks = []
-
         # maybe want to enable flexible copying <-- what did I mean?
+        kind_blocks = []
         for klass in kinds:
             klass_blocks = []
             for unit, mapping in blockmaps:
                 if klass in mapping:
                     klass_blocks.extend((unit, b) for b in mapping[klass])
+
+            # blocks that we are going to merge
+            kind_blocks.append(klass_blocks)
+
+        # create the merge offsets, essentially where the resultant blocks go in the result
+        if not self.result_items.is_unique:
+
+            # length of the merges for each of the klass blocks
+            self.offsets = np.zeros(len(blockmaps))
+            for kb in kind_blocks:
+                kl = list(b.get_merge_length() for unit, b in kb)
+                self.offsets += np.array(kl)
+
+        # merge the blocks to create the result blocks
+        result_blocks = []
+        for klass_blocks in kind_blocks:
             res_blk = self._get_merged_block(klass_blocks)
             result_blocks.append(res_blk)
 
@@ -726,7 +742,8 @@ def _merge_blocks(self, merge_chunks):
 
         n = len(fidx) if fidx is not None else out_shape[self.axis]
 
-        out_shape[0] = sum(blk.get_merge_length() for unit, blk in merge_chunks)
+        merge_lengths = list(blk.get_merge_length() for unit, blk in merge_chunks)
+        out_shape[0] = sum(merge_lengths)
         out_shape[self.axis] = n
 
         # Should use Fortran order??
@@ -746,9 +763,8 @@ def _merge_blocks(self, merge_chunks):
         # calculate by the existing placement plus the offset in the result set
         placement = None
         if not self.result_items.is_unique:
-            nchunks = len(merge_chunks)
-            offsets = np.array([0] + [ len(self.result_items) / nchunks ] * (nchunks-1)).cumsum()
             placement = []
+            offsets = np.append(np.array([0]),self.offsets.cumsum()[:-1])
             for (unit, blk), offset in zip(merge_chunks,offsets):
                 placement.extend(blk.ref_locs+offset)
 
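
The offsets bookkeeping added above can be hard to follow from the diff alone, so here is a small, self-contained sketch of the arithmetic with made-up merge lengths (plain numpy; it assumes, as in the fixed test, that every joined unit contributes one block of each kind): get_result accumulates the total merge length per unit across all block kinds, and _merge_blocks turns that running total into the starting position of each unit's columns in the non-unique result items.

    import numpy as np

    # Hypothetical b.get_merge_length() values, one inner list per block kind,
    # one entry per joined unit; the numbers are illustrative only.
    kind_blocks = [
        [2, 1],   # e.g. float64 blocks: unit 0 brings 2 items, unit 1 brings 1
        [1, 2],   # e.g. int64 blocks:   unit 0 brings 1 item,  unit 1 brings 2
    ]

    # As in get_result: accumulate the per-unit lengths across kinds.
    offsets = np.zeros(2)
    for kl in kind_blocks:
        offsets += np.array(kl)
    # offsets -> [3. 3.]: each unit occupies three items of the result

    # As in _merge_blocks: a chunk's placement starts at the combined width of
    # the units before it (prepend 0, drop the final cumulative total).
    placement_offsets = np.append(np.array([0]), offsets.cumsum()[:-1])
    # placement_offsets -> [0. 3.]: unit 0's ref_locs shift by 0, unit 1's by 3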

pandas/tools/tests/test_merge.py

+16 −1

@@ -15,7 +15,8 @@
 from pandas.tools.merge import merge, concat, ordered_merge, MergeError
 from pandas.util.testing import (assert_frame_equal, assert_series_equal,
                                  assert_almost_equal, rands,
-                                 makeCustomDataframe as mkdf)
+                                 makeCustomDataframe as mkdf,
+                                 assertRaisesRegexp)
 from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range
 import pandas.algos as algos
 import pandas.util.testing as tm
@@ -1435,6 +1436,8 @@ def test_dups_index(self):
         assert_frame_equal(result, expected)
 
     def test_join_dups(self):
+
+        # joining dups
         df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
                      DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
                     axis=1)
@@ -1444,6 +1447,18 @@ def test_join_dups(self):
         result.columns = expected.columns
         assert_frame_equal(result, expected)
 
+        # GH 4975, invalid join on dups
+        w = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        x = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        y = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        z = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+
+        dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer")
+        dta = dta.merge(w, left_index=True, right_index=True)
+        expected = concat([x,y,z,w],axis=1)
+        expected.columns=['x_x','y_x','x_y','y_y','x_x','y_x','x_y','y_y']
+        assert_frame_equal(dta,expected)
+
     def test_handle_empty_objects(self):
         df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
 
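
A note on the expected column labels in the new GH4975 case: merge applies the default '_x'/'_y' suffixes only to overlapping names, so the unsuffixed 'x'/'y' columns picked up from z collide with w's columns in the final merge and receive the same suffixes a second time. That is what produces the duplicate labels the placement fix above has to handle. A rough step-by-step illustration (frame contents are arbitrary; the final line matches the columns asserted in the test):

    import numpy as np
    from pandas import DataFrame

    w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

    step1 = x.merge(y, left_index=True, right_index=True)
    list(step1.columns)   # ['x_x', 'y_x', 'x_y', 'y_y']

    step2 = step1.merge(z, left_index=True, right_index=True, how="outer")
    list(step2.columns)   # ['x_x', 'y_x', 'x_y', 'y_y', 'x', 'y'] -- no overlap, no suffixes

    final = step2.merge(w, left_index=True, right_index=True)
    list(final.columns)   # ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y']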
