Commit 4c756e2 (parent 432c672)

ENH: support for having duplicative indices across blocks (dtypes)
BUG: fix construction of a DataFrame with duplicative indices

5 files changed: +145 −42 lines changed
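In user-facing terms, this commit lets a DataFrame carry duplicate column labels even when the duplicates are split across blocks of different dtypes. A minimal sketch of the behavior, using the mixed-dtype construction from the test_indexing addition below (pandas 0.11.x-era API):

```python
from pandas import DataFrame

# seven columns, one label, three dtypes (int64, float64, object)
df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa'))

# repr and positional access previously could die with
# 'Cannot have duplicate column names split across dtypes'
str(df)
```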

RELEASE.rst (+6)

@@ -62,6 +62,12 @@ pandas 0.11.1
     to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
   - Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
+  - ref_locs support to allow duplicative indices across dtypes (GH3468_)
+  - Non-unique index support clarified (GH3468_)
+
+  - Fix assigning a new index to a duplicate index in a DataFrame would fail
+  - Fix construction of a DataFrame with a duplicate index
+  - ref_locs support to allow duplicative indices across dtypes
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
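For reference, the GH3468 failure mode named above is plain relabeling of a duplicate column index, as exercised in test_assign_columns_with_dups below:

```python
from pandas import DataFrame

df = DataFrame([[1, 2]], columns=['a', 'a'])  # duplicate column index
df.columns = ['a', 'a.1']                     # previously raised; now relabels in place
```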

pandas/core/internals.py (+99 −39)

@@ -56,15 +56,11 @@ def _gi(self, arg):
     @property
     def ref_locs(self):
         if self._ref_locs is None:
-            ri = self.ref_items
-            if ri.is_unique:
-                indexer = ri.get_indexer(self.items)
-                indexer = com._ensure_platform_int(indexer)
-                if (indexer == -1).any():
-                    raise AssertionError('Some block items were not in block '
-                                         'ref_items')
-            else:
-                indexer = np.arange(len(ri))
+            indexer = self.ref_items.get_indexer(self.items)
+            indexer = com._ensure_platform_int(indexer)
+            if (indexer == -1).any():
+                raise AssertionError('Some block items were not in block '
+                                     'ref_items')

             self._ref_locs = indexer
         return self._ref_locs
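The simplified property leans entirely on Index.get_indexer, which maps each block item to its position in ref_items, with -1 marking a miss. A small illustration of that call (plain pandas, no internals):

```python
from pandas import Index

ref_items = Index(['a', 'b', 'c', 'd'])   # the manager's full axis
block_items = Index(['b', 'd'])           # the labels one block carries

indexer = ref_items.get_indexer(block_items)
print(indexer)                 # [1 3]: position of each block item in ref_items
print((indexer == -1).any())   # False; True would mean a stray block item
```

get_indexer requires a unique index, which is presumably why the non-unique path below (set_ref_locs) assigns each block's _ref_locs directly, so this property's computation is never reached for duplicated axes.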
@@ -884,7 +880,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']

     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]

@@ -920,11 +916,83 @@ def set_axis(self, axis, value):
         if len(value) != len(cur_axis):
             raise Exception('Length mismatch (%d vs %d)'
                             % (len(value), len(cur_axis)))
+
         self.axes[axis] = value

         if axis == 0:
-            for block in self.blocks:
-                block.set_ref_items(self.items, maybe_rename=True)
+
+            # unique, we can take
+            if cur_axis.is_unique:
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+            # compute a duplicate indexer that we can use to take
+            # the new items from ref_items (in place of _ref_items)
+            else:
+                self.set_ref_locs(cur_axis)
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+    def set_ref_locs(self, labels=None):
+        # if we have a non-unique index on this axis, set the indexers
+        # we need to set an absolute indexer for the blocks
+        # return the indexer if we are not unique
+        if labels is None:
+            labels = self.items
+
+        if labels.is_unique:
+            return None
+
+        #### THIS IS POTENTIALLY VERY SLOW #####
+
+        # if we are already computed, then we are done
+        if getattr(self, '_ref_locs', None) is not None:
+            return self._ref_locs
+
+        blocks = self.blocks
+
+        # initialize
+        blockmap = dict()
+        for b in blocks:
+            arr = np.empty(len(b.items), dtype='int64')
+            arr.fill(-1)
+            b._ref_locs = arr
+
+            # add this block to the blockmap for each
+            # of the items in the block
+            for item in b.items:
+                if item not in blockmap:
+                    blockmap[item] = []
+                blockmap[item].append(b)
+
+        rl = np.empty(len(labels), dtype=object)
+        for i, item in enumerate(labels.values):
+
+            try:
+                block = blockmap[item].pop(0)
+            except:
+                raise Exception("not enough items in set_ref_locs")
+
+            indexer = np.arange(len(block.items))
+            mask = (block.items == item) & (block._ref_locs == -1)
+            if not mask.any():
+
+                # this case will catch a comparison of an index of tuples
+                mask = np.empty(len(block.items), dtype=bool)
+                mask.fill(False)
+                for j, (bitem, brl) in enumerate(zip(block.items, block._ref_locs)):
+                    mask[j] = bitem == item and brl == -1
+
+            indices = indexer[mask]
+            if len(indices):
+                idx = indices[0]
+            else:
+                raise Exception("already set too many items in set_ref_locs")
+
+            block._ref_locs[idx] = i
+            rl[i] = (block, idx)
+
+        self._ref_locs = rl
+        return rl

     # make items read only for now
     def _get_items(self):
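Stripped to its essentials, set_ref_locs is a greedy first-fit assignment: walk the (possibly duplicated) labels in axis order and hand each occurrence the next unclaimed matching slot, from the next block that carries that label. A self-contained sketch of just that assignment logic; FakeBlock and assign_ref_locs are hypothetical stand-ins, not pandas API:

```python
import numpy as np

class FakeBlock:
    """Stand-in for an internals block: labels plus a slot map initialized to -1."""
    def __init__(self, items):
        self.items = np.asarray(items, dtype=object)
        self.ref_locs = np.full(len(items), -1, dtype='int64')

def assign_ref_locs(labels, blocks):
    # queue of candidate blocks per label; a block is queued once per occurrence
    blockmap = {}
    for b in blocks:
        for item in b.items:
            blockmap.setdefault(item, []).append(b)

    rl = np.empty(len(labels), dtype=object)
    for i, item in enumerate(labels):
        block = blockmap[item].pop(0)                        # next holder of this label
        mask = (block.items == item) & (block.ref_locs == -1)
        idx = np.flatnonzero(mask)[0]                        # first unclaimed slot
        block.ref_locs[idx] = i                              # absolute axis position
        rl[i] = (block, idx)
    return rl

ints = FakeBlock(['a', 'a'])   # say, an int64 block
objs = FakeBlock(['a', 'b'])   # say, an object block
rl = assign_ref_locs(['a', 'a', 'a', 'b'], [ints, objs])
# labels 0 and 1 land in the int block's two 'a' slots, label 2 in the
# object block's 'a' slot, and label 3 in its 'b' slot
```

The (block, slot) pairs in rl are exactly what iget consumes below; the error branches in the real method fire when the labels ask for more (or fewer) occurrences of an item than the blocks collectively hold.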
@@ -1392,26 +1460,11 @@ def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
             return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()

-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]

     def get_scalar(self, tup):
         """
@@ -1587,6 +1640,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # keep track of what items aren't found anywhere
         mask = np.zeros(len(item_order), dtype=bool)

+        new_axes = [new_items] + self.axes[1:]
+
         new_blocks = []
         for blk in self.blocks:
             blk_indexer = blk.items.get_indexer(item_order)

@@ -1610,7 +1665,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
             new_blocks.append(na_block)
         new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def reindex_items(self, new_items, copy=True, fill_value=np.nan):
         """
@@ -1624,6 +1679,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):

         # TODO: this part could be faster (!)
         new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]

         # could have some pathological (MultiIndex) issues here
         new_blocks = []
@@ -1648,7 +1704,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
             new_blocks.append(na_block)
         new_blocks = _consolidate(new_blocks, new_items)

-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)

     def _make_na_block(self, items, ref_items, fill_value=np.nan):
         # TODO: infer dtypes other than float64 from fill_value

@@ -1690,11 +1746,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

         cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
-
         new_axes = list(this.axes)
         new_axes[0] = cons_items

+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
+
         return BlockManager(consolidated, new_axes)

     def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):

@@ -1907,7 +1963,6 @@ def form_blocks(arrays, names, axes):

         na_block = make_block(block_values, extra_items, items)
         blocks.append(na_block)
-        blocks = _consolidate(blocks, items)

     return blocks

@@ -1958,16 +2013,21 @@ def _shape_compat(x):

     names, arrays = zip(*tuples)

-    # index may box values
-    items = ref_items[ref_items.isin(names)]
-
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)

     stacked = np.empty(shape, dtype=dtype)
     for i, arr in enumerate(arrays):
         stacked[i] = _asarray_compat(arr)

+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([n for n in names if n in ref_items])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")
+
     return items, stacked

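The fallback branch exists because boolean isin-selection on a non-unique ref_items picks up every matching entry, which need not line up one-to-one with the stacked arrays; filtering the incoming names instead keeps the count honest, and the length check catches any remaining mismatch. An illustration of the selection difference (plain Index operations; the shapes are hypothetical):

```python
from pandas import Index

ref_items = Index(['a', 'a', 'b'])   # non-unique ref_items
names = ['a', 'b']                   # one stacked array per name

# the isin route selects *every* matching ref_items entry: ['a', 'a', 'b'],
# three labels for two stacked rows
print(ref_items[ref_items.isin(names)])

# the fallback keeps the incoming names, in order: ['a', 'b'], so the
# length matches the number of stacked arrays
print(Index([n for n in names if n in ref_items]))
```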
pandas/tests/test_frame.py (+32 −2)

@@ -9204,18 +9204,48 @@ def test_assign_columns(self):
     def test_assign_columns_with_dups(self):

         # GH 3468 related
+
+        # basic
         df = DataFrame([[1,2]], columns=['a','a'])
         df.columns = ['a','a.1']
-
+        str(df)
         expected = DataFrame([[1,2]], columns=['a','a.1'])
         assert_frame_equal(df, expected)

+        df = DataFrame([[1,2,3]], columns=['b','a','a'])
+        df.columns = ['b','a','a.1']
+        str(df)
+        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
+        assert_frame_equal(df, expected)
+
+        # with a dup index
         df = DataFrame([[1,2]], columns=['a','a'])
         df.columns = ['b','b']
-
+        str(df)
         expected = DataFrame([[1,2]], columns=['b','b'])
         assert_frame_equal(df, expected)

+        # multi-dtype
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
+        df.columns = list('ABCDEFG')
+        str(df)
+        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
+        assert_frame_equal(df, expected)
+
+        # this is an error because we cannot disambiguate the dup columns
+        self.assertRaises(Exception, lambda x: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))
+
+        # dups across blocks
+        df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
+        df_int = DataFrame(np.random.randn(10, 3), dtype='int64')
+        df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
+        df_object = DataFrame('foo', index=df_float.index, columns=df_float.columns)
+        df_dt = DataFrame(Timestamp('20010101'), index=df_float.index, columns=df_float.columns)
+        df = pan.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+        result = df._data.set_ref_locs()
+        self.assert_(len(result) == len(df.columns))
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)

pandas/tests/test_indexing.py (+7)

@@ -772,6 +772,13 @@ def test_dups_fancy_indexing(self):
         expected = Index(['b','a','a'])
         self.assert_(result.equals(expected))

+        # across dtypes
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
+        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
+        result.columns = list('aaaaaaa')
+        assert_frame_equal(df, result)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tests/test_internals.py (+1 −1)

@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
             b.ref_items = items

         mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)

     def test_contains(self):
         self.assert_('a' in self.mgr)
