Skip to content

Commit e5b196d

Browse files
committed
Merge pull request #4757 from jreback/reindex_er
BUG: Bug with reindexing on the index with a non-unique index will now raise a ValueError
2 parents 725b195 + 407e904 commit e5b196d

File tree

8 files changed

+90
-81
lines changed

8 files changed

+90
-81
lines changed

doc/source/release.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ pandas 0.13
151151
- ``Series.isin()`` and ``DataFrame.isin()`` now raise a ``TypeError`` when
152152
passed a string (:issue:`4763`). Pass a ``list`` of one element (containing
153153
the string) instead.
154+
- Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`)
154155

155156
**Internal Refactoring**
156157

@@ -172,7 +173,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
172173
- ``_indexed_same,reindex_like,align,where,mask``
173174
- ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``)
174175
- ``filter`` (also added axis argument to selectively filter on a different axis)
175-
- ``reindex,reindex_axis`` (which was the biggest change to make generic)
176+
- ``reindex,reindex_axis,take``
176177
- ``truncate`` (moved to become part of ``NDFrame``)
177178

178179
- These are API changes which make ``Panel`` more consistent with ``DataFrame``
@@ -224,7 +225,6 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
224225
- Refactor of ``_get_numeric_data/_get_bool_data`` to core/generic.py, allowing Series/Panel functionality
225226
- Refactor of Series arithmetic with time-like objects (datetime/timedelta/time
226227
etc.) into a separate, cleaned up wrapper class. (:issue:`4613`)
227-
- Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`)
228228

229229
**Experimental Features**
230230

@@ -326,6 +326,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
326326
- Bug with using ``QUOTE_NONE`` with ``to_csv`` causing ``Exception``. (:issue:`4328`)
327327
- Bug with Series indexing not raising an error when the right-hand-side has an incorrect length (:issue:`2702`)
328328
- Bug in multi-indexing with a partial string selection as one part of a MultiIndex (:issue:`4758`)
329+
- Reindexing on an axis with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
329330

330331
pandas 0.12
331332
===========

doc/source/v0.13.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ and behaviors. Series formerly subclassed directly from ``ndarray``. (:issue:`40
297297
- ``_indexed_same,reindex_like,align,where,mask``
298298
- ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``)
299299
- ``filter`` (also added axis argument to selectively filter on a different axis)
300-
- ``reindex,reindex_axis`` (which was the biggest change to make generic)
300+
- ``reindex,reindex_axis,take``
301301
- ``truncate`` (moved to become part of ``NDFrame``)
302302

303303
- These are API changes which make ``Panel`` more consistent with ``DataFrame``

pandas/core/frame.py

+2-45
Original file line numberDiff line numberDiff line change
@@ -2267,15 +2267,15 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=NA,
22672267
limit=limit, copy_if_needed=True,
22682268
takeable=takeable)
22692269
return self._reindex_with_indexers({0: [new_index, indexer]},
2270-
copy=copy, fill_value=fill_value)
2270+
copy=copy, fill_value=fill_value, allow_dups=takeable)
22712271

22722272
def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
22732273
limit=None, takeable=False):
22742274
new_columns, indexer = self.columns.reindex(new_columns, level=level,
22752275
limit=limit, copy_if_needed=True,
22762276
takeable=takeable)
22772277
return self._reindex_with_indexers({1: [new_columns, indexer]},
2278-
copy=copy, fill_value=fill_value)
2278+
copy=copy, fill_value=fill_value, allow_dups=takeable)
22792279

22802280
def _reindex_multi(self, axes, copy, fill_value):
22812281
""" we are guaranteed non-Nones in the axes! """
@@ -2513,49 +2513,6 @@ def _maybe_cast(values, labels=None):
25132513

25142514
delevel = deprecate('delevel', reset_index)
25152515

2516-
def take(self, indices, axis=0, convert=True):
2517-
"""
2518-
Analogous to ndarray.take, return DataFrame corresponding to requested
2519-
indices along an axis
2520-
2521-
Parameters
2522-
----------
2523-
indices : list / array of ints
2524-
axis : {0, 1}
2525-
convert : convert indices for negative values, check bounds, default True
2526-
mainly useful for an user routine calling
2527-
2528-
Returns
2529-
-------
2530-
taken : DataFrame
2531-
"""
2532-
2533-
# check/convert indicies here
2534-
if convert:
2535-
axis = self._get_axis_number(axis)
2536-
indices = _maybe_convert_indices(
2537-
indices, len(self._get_axis(axis)))
2538-
2539-
if self._is_mixed_type:
2540-
if axis == 0:
2541-
new_data = self._data.take(indices, axis=1, verify=False)
2542-
return DataFrame(new_data)
2543-
else:
2544-
new_columns = self.columns.take(indices)
2545-
return self.reindex(columns=new_columns)
2546-
else:
2547-
new_values = com.take_nd(self.values,
2548-
com._ensure_int64(indices),
2549-
axis=axis)
2550-
if axis == 0:
2551-
new_columns = self.columns
2552-
new_index = self.index.take(indices)
2553-
else:
2554-
new_columns = self.columns.take(indices)
2555-
new_index = self.index
2556-
return self._constructor(new_values, index=new_index,
2557-
columns=new_columns)
2558-
25592516
#----------------------------------------------------------------------
25602517
# Reindex-based selection methods
25612518

pandas/core/generic.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -862,12 +862,13 @@ def take(self, indices, axis=0, convert=True):
862862
indices = _maybe_convert_indices(
863863
indices, len(self._get_axis(axis)))
864864

865-
if axis == 0:
865+
baxis = self._get_block_manager_axis(axis)
866+
if baxis == 0:
866867
labels = self._get_axis(axis)
867868
new_items = labels.take(indices)
868-
new_data = self._data.reindex_axis(new_items, axis=0)
869+
new_data = self._data.reindex_axis(new_items, indexer=indices, axis=0)
869870
else:
870-
new_data = self._data.take(indices, axis=axis, verify=False)
871+
new_data = self._data.take(indices, axis=baxis)
871872
return self._constructor(new_data)
872873

873874
def select(self, crit, axis=0):
@@ -944,7 +945,7 @@ def drop(self, labels, axis=0, level=None):
944945
new_axis = axis.drop(labels, level=level)
945946
else:
946947
new_axis = axis.drop(labels)
947-
dropped = self.reindex(**{axis_name: new_axis})
948+
dropped = self.reindex(**{ axis_name: new_axis })
948949
try:
949950
dropped.axes[axis_].set_names(axis.names, inplace=True)
950951
except AttributeError:
@@ -1161,7 +1162,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
11611162
return self._reindex_with_indexers({axis: [new_index, indexer]}, method=method, fill_value=fill_value,
11621163
limit=limit, copy=copy)._propogate_attributes(self)
11631164

1164-
def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False):
1165+
def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False, allow_dups=False):
1166+
""" allow_dups indicates an internal call here """
11651167

11661168
# reindex doing multiple operations on different axes if indicated
11671169
new_data = self._data
@@ -1183,7 +1185,7 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, lim
11831185
# TODO: speed up on homogeneous DataFrame objects
11841186
indexer = com._ensure_int64(indexer)
11851187
new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
1186-
fill_value=fill_value)
1188+
fill_value=fill_value, allow_dups=allow_dups)
11871189

11881190
elif baxis == 0 and index is not None and index is not new_data.axes[baxis]:
11891191
new_data = new_data.reindex_items(index, copy=copy,

pandas/core/indexing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,7 @@ def _reindex(keys, level=None):
668668
if axis+1 > ndim:
669669
raise AssertionError("invalid indexing error with non-unique index")
670670

671-
result = result._reindex_with_indexers({ axis : [ new_labels, new_indexer ] }, copy=True)
671+
result = result._reindex_with_indexers({ axis : [ new_labels, new_indexer ] }, copy=True, allow_dups=True)
672672

673673
return result
674674

pandas/core/internals.py

+45-8
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None
198198
raise AssertionError('axis must be at least 1, got %d' % axis)
199199
if fill_value is None:
200200
fill_value = self.fill_value
201+
201202
new_values = com.take_nd(self.values, indexer, axis,
202203
fill_value=fill_value, mask_info=mask_info)
203204
return make_block(
@@ -1515,15 +1516,28 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_valu
15151516
if indexer is None:
15161517
indexer = np.arange(len(self.items))
15171518

1518-
new_values = com.take_1d(self.values.values, indexer)
1519+
# single block
1520+
if self.ndim == 1:
1521+
1522+
new_items = new_ref_items
1523+
new_values = com.take_1d(self.values.values, indexer)
1524+
1525+
else:
1526+
1527+
# if we don't overlap at all, then don't include this block
1528+
new_items = self.items & new_ref_items
1529+
if not len(new_items):
1530+
return None
1531+
1532+
new_values = self.values.values
15191533

15201534
# fill if needed
15211535
if method is not None or limit is not None:
15221536
if fill_value is None:
15231537
fill_value = self.fill_value
15241538
new_values = com.interpolate_2d(new_values, method=method, limit=limit, fill_value=fill_value)
15251539

1526-
return self.make_block(new_values, items=new_ref_items, ref_items=new_ref_items, copy=copy)
1540+
return self.make_block(new_values, items=new_items, ref_items=new_ref_items, copy=copy)
15271541

15281542
def sparse_reindex(self, new_index):
15291543
""" sparse reindex and return a new block
@@ -2718,10 +2732,14 @@ def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, fill_va
27182732
raise AssertionError('method argument not supported for '
27192733
'axis == 0')
27202734

2721-
def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None):
2735+
def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None, allow_dups=False):
27222736
"""
27232737
pandas-indexer with -1's only.
27242738
"""
2739+
# trying to reindex on an axis with duplicates
2740+
if not allow_dups and not self.axes[axis].is_unique:
2741+
raise ValueError("cannot reindex from a duplicate axis")
2742+
27252743
if axis == 0:
27262744
return self._reindex_indexer_items(new_axis, indexer, fill_value)
27272745

@@ -2789,15 +2807,34 @@ def reindex_items(self, new_items, indexer=None, copy=True, fill_value=None):
27892807
if indexer is None:
27902808
for blk in self.blocks:
27912809
if copy:
2792-
new_blocks.append(blk.reindex_items_from(new_items))
2810+
blk = blk.reindex_items_from(new_items)
27932811
else:
27942812
blk.ref_items = new_items
2813+
if blk is not None:
27952814
new_blocks.append(blk)
27962815
else:
2797-
for block in self.blocks:
2798-
newb = block.reindex_items_from(new_items, copy=copy)
2799-
if len(newb.items) > 0:
2800-
new_blocks.append(newb)
2816+
2817+
# unique
2818+
if self.axes[0].is_unique:
2819+
for block in self.blocks:
2820+
2821+
newb = block.reindex_items_from(new_items, copy=copy)
2822+
if newb is not None and len(newb.items) > 0:
2823+
new_blocks.append(newb)
2824+
2825+
# non-unique
2826+
else:
2827+
rl = self._set_ref_locs()
2828+
for i, idx in enumerate(indexer):
2829+
blk, lidx = rl[idx]
2830+
item = new_items.take([i])
2831+
blk = make_block(_block_shape(blk.iget(lidx)),
2832+
item,
2833+
new_items,
2834+
ndim=self.ndim,
2835+
fastpath=True,
2836+
placement = [i])
2837+
new_blocks.append(blk)
28012838

28022839
# add a na block if we are missing items
28032840
mask = indexer == -1

pandas/sparse/tests/test_sparse.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,9 @@ def test_getitem_slice(self):
384384
idx = self.bseries.index
385385
res = self.bseries[::2]
386386
tm.assert_isinstance(res, SparseSeries)
387-
assert_sp_series_equal(res, self.bseries.reindex(idx[::2]))
387+
388+
expected = self.bseries.reindex(idx[::2])
389+
assert_sp_series_equal(res, expected)
388390

389391
res = self.bseries[:5]
390392
tm.assert_isinstance(res, SparseSeries)

pandas/tests/test_frame.py

+27-17
Original file line numberDiff line numberDiff line change
@@ -2879,7 +2879,7 @@ def test_constructor_column_duplicates(self):
28792879
columns=['b', 'a', 'a'])
28802880

28812881

2882-
def test_column_duplicates_operations(self):
2882+
def test_column_dups_operations(self):
28832883

28842884
def check(result, expected=None):
28852885
if expected is not None:
@@ -2973,22 +2973,6 @@ def check(result, expected=None):
29732973
expected = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','hello','foo2'])
29742974
check(df,expected)
29752975

2976-
# reindex
2977-
df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
2978-
expected = DataFrame([[1],[1],[1]],columns=['bar'])
2979-
result = df.reindex(columns=['bar'])
2980-
check(result,expected)
2981-
2982-
result1 = DataFrame([[1],[1],[1]],columns=['bar']).reindex(columns=['bar','foo'])
2983-
result2 = df.reindex(columns=['bar','foo'])
2984-
check(result2,result1)
2985-
2986-
# drop
2987-
df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
2988-
df = df.drop(['a'],axis=1)
2989-
expected = DataFrame([[1],[1],[1]],columns=['bar'])
2990-
check(df,expected)
2991-
29922976
# values
29932977
df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x'])
29942978
result = df.values
@@ -3016,6 +3000,17 @@ def check(result, expected=None):
30163000
columns=['RT','TClose','TExg','RPT_Date','STK_ID','STK_Name','QT_Close']).set_index(['STK_ID','RPT_Date'],drop=False)
30173001
assert_frame_equal(result,expected)
30183002

3003+
# reindex is invalid!
3004+
df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
3005+
self.assertRaises(ValueError, df.reindex, columns=['bar'])
3006+
self.assertRaises(ValueError, df.reindex, columns=['bar','foo'])
3007+
3008+
# drop
3009+
df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
3010+
df = df.drop(['a'],axis=1)
3011+
expected = DataFrame([[1],[1],[1]],columns=['bar'])
3012+
check(df,expected)
3013+
30193014
def test_insert_benchmark(self):
30203015
# from the vb_suite/frame_methods/frame_insert_columns
30213016
N = 10
@@ -7573,6 +7568,21 @@ def test_reindex_fill_value(self):
75737568
expected = df.reindex(lrange(15)).fillna(0)
75747569
assert_frame_equal(result, expected)
75757570

7571+
def test_reindex_dups(self):
7572+
7573+
# GH4746, reindex on duplicate index error messages
7574+
arr = np.random.randn(10)
7575+
df = DataFrame(arr,index=[1,2,3,4,5,1,2,3,4,5])
7576+
7577+
# set index is ok
7578+
result = df.copy()
7579+
result.index = list(range(len(df)))
7580+
expected = DataFrame(arr,index=list(range(len(df))))
7581+
assert_frame_equal(result,expected)
7582+
7583+
# reindex fails
7584+
self.assertRaises(ValueError, df.reindex, index=list(range(len(df))))
7585+
75767586
def test_align(self):
75777587

75787588
af, bf = self.frame.align(self.frame)

0 commit comments

Comments
 (0)