Skip to content

Commit 1c0b105

Browse files
committed
BUG: not setting placement on reindex_with_indexers
ENH: extend index.reindex to handle non-unique indices (rather than raising) TST: more tests/optimizations for dup_columns
1 parent 8bcf581 commit 1c0b105

File tree

6 files changed

+62
-20
lines changed

6 files changed

+62
-20
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ pandas 0.11.1
136136
- Concat producing non-unique columns when duplicates are across dtypes is fixed (GH3602_)
137137
- Non-unique indexing with a slice via ``loc`` and friends fixed (GH3659_)
138138
- Allow insert/delete to non-unique columns (GH3679_)
139+
- Extend ``reindex`` to correctly deal with non-unique indices (GH3679_)
139140
- Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
140141
- Fixed bug in mixed-frame assignment with aligned series (GH3492_)
141142
- Fixed bug where selecting month/quarter/year from a series would not select the time element

pandas/core/index.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -940,8 +940,15 @@ def reindex(self, target, method=None, level=None, limit=None):
940940
if self.equals(target):
941941
indexer = None
942942
else:
943-
indexer = self.get_indexer(target, method=method,
944-
limit=limit)
943+
if self.is_unique:
944+
indexer = self.get_indexer(target, method=method,
945+
limit=limit)
946+
else:
947+
if method is not None or limit is not None:
948+
raise ValueError("cannot reindex a non-unique index "
949+
"with a method or limit")
950+
indexer, missing = self.get_indexer_non_unique(target)
951+
945952
return target, indexer
946953

947954
def join(self, other, how='left', level=None, return_indexers=False):

pandas/core/indexing.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ def _reindex(keys, level=None):
457457
else:
458458
level = None
459459

460-
if labels.is_unique:
460+
if labels.is_unique and Index(keyarr).is_unique:
461461
return _reindex(keyarr, level=level)
462462
else:
463463
indexer, missing = labels.get_indexer_non_unique(keyarr)
@@ -991,7 +991,6 @@ def _slice(self, indexer, axis=0):
991991
def _setitem_with_indexer(self, indexer, value):
992992
self.obj._set_values(indexer, value)
993993

994-
995994
def _check_bool_indexer(ax, key):
996995
# boolean indexing, need to check that the data are aligned, otherwise
997996
# disallowed
@@ -1010,7 +1009,6 @@ def _check_bool_indexer(ax, key):
10101009
result = np.asarray(result, dtype=bool)
10111010
return result
10121011

1013-
10141012
def _is_series(obj):
10151013
from pandas.core.series import Series
10161014
return isinstance(obj, Series)

pandas/core/internals.py

+16-13
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def set_ref_locs(self, placement):
8080
if placement is None:
8181
self._ref_locs = None
8282
else:
83-
self._ref_locs = np.array(placement,dtype='int64').copy()
83+
self._ref_locs = np.array(placement,dtype='int64', copy=True)
8484

8585
def set_ref_items(self, ref_items, maybe_rename=True):
8686
"""
@@ -154,7 +154,8 @@ def reindex_axis(self, indexer, axis=1, fill_value=np.nan, mask_info=None):
154154
raise AssertionError('axis must be at least 1, got %d' % axis)
155155
new_values = com.take_nd(self.values, indexer, axis,
156156
fill_value=fill_value, mask_info=mask_info)
157-
return make_block(new_values, self.items, self.ref_items, fastpath=True)
157+
return make_block(new_values, self.items, self.ref_items, fastpath=True,
158+
placement=self._ref_locs)
158159

159160
def reindex_items_from(self, new_ref_items, copy=True):
160161
"""
@@ -168,6 +169,7 @@ def reindex_items_from(self, new_ref_items, copy=True):
168169
reindexed : Block
169170
"""
170171
new_ref_items, indexer = self.items.reindex(new_ref_items)
172+
171173
if indexer is None:
172174
new_items = new_ref_items
173175
new_values = self.values.copy() if copy else self.values
@@ -1078,25 +1080,25 @@ def _set_ref_locs(self, labels=None, do_refs=False):
10781080
10791081
"""
10801082

1081-
im = None
10821083
if labels is None:
10831084
labels = self.items
1084-
else:
1085-
_ensure_index(labels)
10861085

10871086
# we are unique, and coming from a unique
1088-
if labels.is_unique and not do_refs:
1087+
is_unique = labels.is_unique
1088+
if is_unique and not do_refs:
10891089

1090-
# reset our ref locs
1091-
self._ref_locs = None
1092-
for b in self.blocks:
1093-
b._ref_locs = None
1090+
if not self.items.is_unique:
1091+
1092+
# reset our ref locs
1093+
self._ref_locs = None
1094+
for b in self.blocks:
1095+
b._ref_locs = None
10941096

10951097
return None
10961098

10971099
# we are going to a non-unique index
10981100
# we have ref_locs on the block at this point
1099-
if (not labels.is_unique and do_refs) or do_refs=='force':
1101+
if (not is_unique and do_refs) or do_refs=='force':
11001102

11011103
# create the items map
11021104
im = getattr(self,'_items_map',None)
@@ -1972,17 +1974,18 @@ def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=np.nan):
19721974
def _reindex_indexer_items(self, new_items, indexer, fill_value):
19731975
# TODO: less efficient than I'd like
19741976

1977+
is_unique = self.items.is_unique
19751978
item_order = com.take_1d(self.items.values, indexer)
19761979

19771980
# keep track of what items aren't found anywhere
19781981
mask = np.zeros(len(item_order), dtype=bool)
1979-
19801982
new_axes = [new_items] + self.axes[1:]
19811983

19821984
new_blocks = []
19831985
for blk in self.blocks:
19841986
blk_indexer = blk.items.get_indexer(item_order)
19851987
selector = blk_indexer != -1
1988+
19861989
# update with observed items
19871990
mask |= selector
19881991

@@ -2142,7 +2145,7 @@ def rename_axis(self, mapper, axis=1):
21422145

21432146
def rename_items(self, mapper, copydata=True):
21442147
new_items = Index([mapper(x) for x in self.items])
2145-
new_items.is_unique
2148+
is_unique = new_items.is_unique
21462149

21472150
new_blocks = []
21482151
for block in self.blocks:

pandas/tests/test_frame.py

+33
Original file line numberDiff line numberDiff line change
@@ -2860,6 +2860,12 @@ def check(result, expected=None):
28602860
expected = DataFrame([[1,1,1,5,'bah',3],[1,1,2,5,'bah',3],[2,1,3,5,'bah',3]],columns=['foo','bar','foo','hello','string','foo2'])
28612861
check(df,expected)
28622862

2863+
# set (non-dup)
2864+
df['foo2'] = 4
2865+
expected = DataFrame([[1,1,1,5,'bah',4],[1,1,2,5,'bah',4],[2,1,3,5,'bah',4]],columns=['foo','bar','foo','hello','string','foo2'])
2866+
check(df,expected)
2867+
df['foo2'] = 3
2868+
28632869
# delete (non dup)
28642870
del df['bar']
28652871
expected = DataFrame([[1,1,5,'bah',3],[1,2,5,'bah',3],[2,3,5,'bah',3]],columns=['foo','foo','hello','string','foo2'])
@@ -2912,6 +2918,33 @@ def check(result, expected=None):
29122918
expected = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','hello','foo2'])
29132919
check(df,expected)
29142920

2921+
# reindex
2922+
df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
2923+
expected = DataFrame([[1],[1],[1]],columns=['bar'])
2924+
result = df.reindex(columns=['bar'])
2925+
check(result,expected)
2926+
2927+
result1 = DataFrame([[1],[1],[1]],columns=['bar']).reindex(columns=['bar','foo'])
2928+
result2 = df.reindex(columns=['bar','foo'])
2929+
check(result2,result1)
2930+
2931+
# drop
2932+
df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
2933+
df = df.drop(['a'],axis=1)
2934+
expected = DataFrame([[1],[1],[1]],columns=['bar'])
2935+
check(df,expected)
2936+
2937+
def test_insert_benchmark(self):
2938+
# from the vb_suite/frame_methods/frame_insert_columns
2939+
N = 10
2940+
K = 5
2941+
df = DataFrame(index=range(N))
2942+
new_col = np.random.randn(N)
2943+
for i in range(K):
2944+
df[i] = new_col
2945+
expected = DataFrame(np.repeat(new_col,K).reshape(N,K),index=range(N))
2946+
assert_frame_equal(df,expected)
2947+
29152948
def test_constructor_single_value(self):
29162949

29172950
# expecting single value upcasting here

pandas/tools/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -781,10 +781,10 @@ def _upcast_blocks(blocks):
781781
for block in blocks:
782782
if isinstance(block, IntBlock):
783783
newb = make_block(block.values.astype(float), block.items,
784-
block.ref_items)
784+
block.ref_items, placement=block._ref_locs)
785785
elif isinstance(block, BoolBlock):
786786
newb = make_block(block.values.astype(object), block.items,
787-
block.ref_items)
787+
block.ref_items, placement=block._ref_locs)
788788
else:
789789
newb = block
790790
new_blocks.append(newb)

0 commit comments

Comments
 (0)