
Commit e338862

BUG: concatenation with duplicate columns across dtypes not merging with axis=0 (originally GH4771), fixed again in GH4975
1 parent 0fa4d06 commit e338862

File tree

3 files changed: +38 −7 lines changed

  doc/source/release.rst
  pandas/tools/merge.py
  pandas/tools/tests/test_merge.py

doc/source/release.rst

+1 −1

@@ -432,7 +432,7 @@ Bug Fixes
   - Bug in multi-indexing with a partial string selection as one part of a MultIndex (:issue:`4758`)
   - Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
   - Bug in setting with ``loc/ix`` a single indexer with a multi-index axis and a numpy array, related to (:issue:`3777`)
-  - Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`)
+  - Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`, :issue:`4975`)
   - Bug in ``iloc`` with a slice index failing (:issue:`4771`)
   - Incorrect error message with no colspecs or width in ``read_fwf``. (:issue:`4774`)
   - Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, :issue:`4550`)
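
As context for the changed entry, a minimal sketch of the scenario it describes, assuming the same frame layout as the test added in this commit (values are arbitrary): a frame whose duplicate column labels span two dtypes should concatenate along axis=0 without mis-merging blocks.

    import numpy as np
    from pandas import DataFrame, concat

    # Duplicate 'A' labels backed by different dtypes: float64 from the first
    # piece, int64 from the second (mirrors the frame built in test_join_dups).
    df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
                 DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                           columns=['A', 'C'])],
                axis=1)

    # GH4771: stacking the frame on itself along axis=0 should simply repeat
    # the rows and keep all six (partly duplicated) column labels.
    stacked = concat([df, df], axis=0)
    assert stacked.shape == (20, 6)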

pandas/tools/merge.py

+21 −5

@@ -655,6 +655,7 @@ def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
         self.join_index = join_index
         self.axis = axis
         self.copy = copy
+        self.offsets = None
 
         # do NOT sort
         self.result_items = _concat_indexes([d.items for d in data_list])
@@ -683,14 +684,29 @@ def get_result(self):
         blockmaps = self._prepare_blocks()
         kinds = _get_merge_block_kinds(blockmaps)
 
-        result_blocks = []
-
         # maybe want to enable flexible copying <-- what did I mean?
+        kind_blocks = []
         for klass in kinds:
             klass_blocks = []
             for unit, mapping in blockmaps:
                 if klass in mapping:
                     klass_blocks.extend((unit, b) for b in mapping[klass])
+
+            # blocks that we are going to merge
+            kind_blocks.append(klass_blocks)
+
+        # create the merge offsets, essentially where the resultant blocks go in the result
+        if not self.result_items.is_unique:
+
+            # length of the merges for each of the klass blocks
+            self.offsets = np.zeros(len(blockmaps))
+            for kb in kind_blocks:
+                kl = list(b.get_merge_length() for unit, b in kb)
+                self.offsets += np.array(kl)
+
+        # merge the blocks to create the result blocks
+        result_blocks = []
+        for klass_blocks in kind_blocks:
             res_blk = self._get_merged_block(klass_blocks)
             result_blocks.append(res_blk)
 
@@ -726,7 +742,8 @@ def _merge_blocks(self, merge_chunks):
 
         n = len(fidx) if fidx is not None else out_shape[self.axis]
 
-        out_shape[0] = sum(blk.get_merge_length() for unit, blk in merge_chunks)
+        merge_lengths = list(blk.get_merge_length() for unit, blk in merge_chunks)
+        out_shape[0] = sum(merge_lengths)
         out_shape[self.axis] = n
 
         # Should use Fortran order??
@@ -746,9 +763,8 @@ def _merge_blocks(self, merge_chunks):
         # calculate by the existing placement plus the offset in the result set
         placement = None
         if not self.result_items.is_unique:
-            nchunks = len(merge_chunks)
-            offsets = np.array([0] + [ len(self.result_items) / nchunks ] * (nchunks-1)).cumsum()
             placement = []
+            offsets = np.append(np.array([0]),self.offsets.cumsum()[:-1])
             for (unit, blk), offset in zip(merge_chunks,offsets):
                 placement.extend(blk.ref_locs+offset)
 
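
The offsets bookkeeping added above can be hard to follow from the diff alone, so here is a small, self-contained sketch of the arithmetic with made-up merge lengths (plain numpy; it assumes, as in the fixed test, that every joined unit contributes one block of each kind): get_result accumulates the total merge length per unit across all block kinds, and _merge_blocks turns that running total into the starting position of each unit's columns in the non-unique result items.

    import numpy as np

    # Hypothetical b.get_merge_length() values, one inner list per block kind,
    # one entry per joined unit; the numbers are illustrative only.
    kind_blocks = [
        [2, 1],   # e.g. float64 blocks: unit 0 brings 2 items, unit 1 brings 1
        [1, 2],   # e.g. int64 blocks:   unit 0 brings 1 item,  unit 1 brings 2
    ]

    # As in get_result: accumulate the per-unit lengths across kinds.
    offsets = np.zeros(2)
    for kl in kind_blocks:
        offsets += np.array(kl)
    # offsets -> [3. 3.]: each unit occupies three items of the result

    # As in _merge_blocks: a chunk's placement starts at the combined width of
    # the units before it (prepend 0, drop the final cumulative total).
    placement_offsets = np.append(np.array([0]), offsets.cumsum()[:-1])
    # placement_offsets -> [0. 3.]: unit 0's ref_locs shift by 0, unit 1's by 3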

pandas/tools/tests/test_merge.py

+16 −1

@@ -15,7 +15,8 @@
 from pandas.tools.merge import merge, concat, ordered_merge, MergeError
 from pandas.util.testing import (assert_frame_equal, assert_series_equal,
                                  assert_almost_equal, rands,
-                                 makeCustomDataframe as mkdf)
+                                 makeCustomDataframe as mkdf,
+                                 assertRaisesRegexp)
 from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range
 import pandas.algos as algos
 import pandas.util.testing as tm
@@ -1435,6 +1436,8 @@ def test_dups_index(self):
         assert_frame_equal(result, expected)
 
     def test_join_dups(self):
+
+        # joining dups
         df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
                      DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
                     axis=1)
@@ -1444,6 +1447,18 @@ def test_join_dups(self):
         result.columns = expected.columns
         assert_frame_equal(result, expected)
 
+        # GH 4975, invalid join on dups
+        w = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        x = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        y = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+        z = DataFrame(np.random.randn(4,2), columns=["x", "y"])
+
+        dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer")
+        dta = dta.merge(w, left_index=True, right_index=True)
+        expected = concat([x,y,z,w],axis=1)
+        expected.columns=['x_x','y_x','x_y','y_y','x_x','y_x','x_y','y_y']
+        assert_frame_equal(dta,expected)
+
     def test_handle_empty_objects(self):
         df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
 
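
A note on the expected column labels in the new GH4975 case: merge applies the default '_x'/'_y' suffixes only to overlapping names, so the unsuffixed 'x'/'y' columns picked up from z collide with w's columns in the final merge and receive the same suffixes a second time. That is what produces the duplicate labels the placement fix above has to handle. A rough step-by-step illustration (frame contents are arbitrary; the final line matches the columns asserted in the test):

    import numpy as np
    from pandas import DataFrame

    w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

    step1 = x.merge(y, left_index=True, right_index=True)
    list(step1.columns)   # ['x_x', 'y_x', 'x_y', 'y_y']

    step2 = step1.merge(z, left_index=True, right_index=True, how="outer")
    list(step2.columns)   # ['x_x', 'y_x', 'x_y', 'y_y', 'x', 'y'] -- no overlap, no suffixes

    final = step2.merge(w, left_index=True, right_index=True)
    list(final.columns)   # ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y']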
