Skip to content

Commit 38a87b6

Browse files
committed
Merge pull request #4772 from jreback/concat_dups
BUG: Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (GH4771)
2 parents c1ab38e + 75d378b commit 38a87b6

File tree

5 files changed

+106
-13
lines changed

5 files changed

+106
-13
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
331331
- Bug in multi-indexing with a partial string selection as one part of a MultiIndex (:issue:`4758`)
332332
- Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
333333
- Bug in setting with ``loc/ix`` a single indexer with a multi-index axis and a numpy array, related to (:issue:`3777`)
334+
- Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`)
335+
- Bug in ``iloc`` with a slice index failing (:issue:`4771`)
334336

335337
pandas 0.12
336338
===========

pandas/core/internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2174,7 +2174,7 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
21742174
placement=blk._ref_locs)
21752175
new_blocks = [newb]
21762176
else:
2177-
return self.reindex_items(new_items)
2177+
return self.reindex_items(new_items, indexer=np.arange(len(self.items))[slobj])
21782178
else:
21792179
new_blocks = self._slice_blocks(slobj, axis)
21802180

pandas/tests/test_indexing.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
MultiIndex, DatetimeIndex, Timestamp)
1717
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
1818
assert_frame_equal, assert_panel_equal)
19-
from pandas import compat
19+
from pandas import compat, concat
2020

2121
import pandas.util.testing as tm
2222
import pandas.lib as lib
@@ -359,6 +359,29 @@ def test_iloc_getitem_slice(self):
359359
self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'])
360360
self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError)
361361

362+
def test_iloc_getitem_slice_dups(self):
363+
364+
df1 = DataFrame(np.random.randn(10,4),columns=['A','A','B','B'])
365+
df2 = DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])
366+
367+
# axis=1
368+
df = concat([df1,df2],axis=1)
369+
assert_frame_equal(df.iloc[:,:4],df1)
370+
assert_frame_equal(df.iloc[:,4:],df2)
371+
372+
df = concat([df2,df1],axis=1)
373+
assert_frame_equal(df.iloc[:,:2],df2)
374+
assert_frame_equal(df.iloc[:,2:],df1)
375+
376+
assert_frame_equal(df.iloc[:,0:3],concat([df2,df1.iloc[:,[0]]],axis=1))
377+
378+
# axis=0
379+
df = concat([df,df],axis=0)
380+
assert_frame_equal(df.iloc[0:10,:2],df2)
381+
assert_frame_equal(df.iloc[0:10,2:],df1)
382+
assert_frame_equal(df.iloc[10:,:2],df2)
383+
assert_frame_equal(df.iloc[10:,2:],df1)
384+
362385
def test_iloc_getitem_out_of_bounds(self):
363386

364387
# out-of-bounds slice

pandas/tools/merge.py

+31-11
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,7 @@ def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
649649
for data, indexer in zip(data_list, indexers):
650650
if not data.is_consolidated():
651651
data = data.consolidate()
652+
data._set_ref_locs()
652653
self.units.append(_JoinUnit(data.blocks, indexer))
653654

654655
self.join_index = join_index
@@ -682,7 +683,6 @@ def get_result(self):
682683
blockmaps = self._prepare_blocks()
683684
kinds = _get_merge_block_kinds(blockmaps)
684685

685-
result_is_unique = self.result_axes[0].is_unique
686686
result_blocks = []
687687

688688
# maybe want to enable flexible copying <-- what did I mean?
@@ -692,23 +692,28 @@ def get_result(self):
692692
if klass in mapping:
693693
klass_blocks.extend((unit, b) for b in mapping[klass])
694694
res_blk = self._get_merged_block(klass_blocks)
695-
696-
# if we have a unique result index, need to clear the _ref_locs
697-
# a non-unique is set as we are creating
698-
if result_is_unique:
699-
res_blk.set_ref_locs(None)
700-
701695
result_blocks.append(res_blk)
702696

703697
return BlockManager(result_blocks, self.result_axes)
704698

705699
def _get_merged_block(self, to_merge):
706700
if len(to_merge) > 1:
701+
702+
# placement set here
707703
return self._merge_blocks(to_merge)
708704
else:
709705
unit, block = to_merge[0]
710-
return unit.reindex_block(block, self.axis,
711-
self.result_items, copy=self.copy)
706+
blk = unit.reindex_block(block, self.axis,
707+
self.result_items, copy=self.copy)
708+
709+
# set placement / invalidate on a unique result
710+
if self.result_items.is_unique and blk._ref_locs is not None:
711+
if not self.copy:
712+
blk = blk.copy()
713+
blk.set_ref_locs(None)
714+
715+
return blk
716+
712717

713718
def _merge_blocks(self, merge_chunks):
714719
"""
@@ -736,7 +741,18 @@ def _merge_blocks(self, merge_chunks):
736741

737742
# does not sort
738743
new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
739-
return make_block(out, new_block_items, self.result_items)
744+
745+
# need to set placement if we have a non-unique result
746+
# calculate by the existing placement plus the offset in the result set
747+
placement = None
748+
if not self.result_items.is_unique:
749+
nchunks = len(merge_chunks)
750+
offsets = np.array([0] + [ len(self.result_items) / nchunks ] * (nchunks-1)).cumsum()
751+
placement = []
752+
for (unit, blk), offset in zip(merge_chunks,offsets):
753+
placement.extend(blk.ref_locs+offset)
754+
755+
return make_block(out, new_block_items, self.result_items, placement=placement)
740756

741757

742758
class _JoinUnit(object):
@@ -992,6 +1008,7 @@ def _prepare_blocks(self):
9921008
blockmaps = []
9931009
for data in reindexed_data:
9941010
data = data.consolidate()
1011+
data._set_ref_locs()
9951012
blockmaps.append(data.get_block_map(typ='dict'))
9961013
return blockmaps, reindexed_data
9971014

@@ -1063,7 +1080,10 @@ def _concat_blocks(self, blocks):
10631080
# or maybe would require performance test)
10641081
raise PandasError('dtypes are not consistent throughout '
10651082
'DataFrames')
1066-
return make_block(concat_values, blocks[0].items, self.new_axes[0])
1083+
return make_block(concat_values,
1084+
blocks[0].items,
1085+
self.new_axes[0],
1086+
placement=blocks[0]._ref_locs)
10671087
else:
10681088

10691089
offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for

pandas/tools/tests/test_merge.py

+48
Original file line numberDiff line numberDiff line change
@@ -1396,6 +1396,54 @@ def test_crossed_dtypes_weird_corner(self):
13961396
[df, df2], keys=['one', 'two'], names=['first', 'second'])
13971397
self.assertEqual(result.index.names, ('first', 'second'))
13981398

1399+
def test_dups_index(self):
1400+
# GH 4771
1401+
1402+
# single dtypes
1403+
df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C'])
1404+
1405+
result = concat([df,df],axis=1)
1406+
assert_frame_equal(result.iloc[:,:4],df)
1407+
assert_frame_equal(result.iloc[:,4:],df)
1408+
1409+
result = concat([df,df],axis=0)
1410+
assert_frame_equal(result.iloc[:10],df)
1411+
assert_frame_equal(result.iloc[10:],df)
1412+
1413+
# multi dtypes
1414+
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
1415+
DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
1416+
axis=1)
1417+
1418+
result = concat([df,df],axis=1)
1419+
assert_frame_equal(result.iloc[:,:6],df)
1420+
assert_frame_equal(result.iloc[:,6:],df)
1421+
1422+
result = concat([df,df],axis=0)
1423+
assert_frame_equal(result.iloc[:10],df)
1424+
assert_frame_equal(result.iloc[10:],df)
1425+
1426+
# append
1427+
result = df.iloc[0:8,:].append(df.iloc[8:])
1428+
assert_frame_equal(result, df)
1429+
1430+
result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10])
1431+
assert_frame_equal(result, df)
1432+
1433+
expected = concat([df,df],axis=0)
1434+
result = df.append(df)
1435+
assert_frame_equal(result, expected)
1436+
1437+
def test_join_dups(self):
1438+
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
1439+
DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
1440+
axis=1)
1441+
1442+
expected = concat([df,df],axis=1)
1443+
result = df.join(df,rsuffix='_2')
1444+
result.columns = expected.columns
1445+
assert_frame_equal(result, expected)
1446+
13991447
def test_handle_empty_objects(self):
14001448
df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
14011449

0 commit comments

Comments
 (0)