diff --git a/doc/source/release.rst b/doc/source/release.rst index 8fba8618fd860..9be9a03b0346e 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -264,6 +264,7 @@ See :ref:`Internal Refactoring` to a possible lazay frequency inference issue (:issue:`3317`) - Fixed issue where ``DataFrame.apply`` was reraising exceptions incorrectly (causing the original stack trace to be truncated). + - Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`) pandas 0.12 =========== diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 11818a4fea7c8..3a0123b40aa39 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -476,8 +476,12 @@ def _reindex(keys, level=None): else: level = None - if labels.is_unique and Index(keyarr).is_unique: + keyarr_is_unique = Index(keyarr).is_unique + + # existing labels are unique and indexer is unique + if labels.is_unique and keyarr_is_unique: return _reindex(keyarr, level=level) + else: indexer, missing = labels.get_indexer_non_unique(keyarr) check = indexer != -1 @@ -496,8 +500,15 @@ def _reindex(keys, level=None): new_labels = np.empty(tuple([len(indexer)]),dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels - new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values - new_indexer[missing_indexer] = -1 + + # a unique indexer + if keyarr_is_unique: + new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original indexer here + else: + new_indexer = indexer # reindex with the specified axis ndim = self.obj.ndim diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 101cbd9d6baf8..26cef3acbfad1 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -796,20 +796,21 @@ def test_dups_fancy_indexing(self): assert_frame_equal(df,result) # GH 3561, dups not in selected order - ind = ['A', 'A', 'B', 'C'] - df = DataFrame({'test':lrange(len(ind))}, index=ind) + df = DataFrame({'test': [5,7,9,11]}, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] - res = df.ix[rows] - self.assert_(rows == list(res.index)) + expected = DataFrame({'test' : [11,9]},index=rows) + result = df.ix[rows] + assert_frame_equal(result, expected) - res = df.ix[Index(rows)] - self.assert_(Index(rows).equals(res.index)) + result = df.ix[Index(rows)] + assert_frame_equal(result, expected) rows = ['C','B','E'] - res = df.ix[rows] - self.assert_(rows == list(res.index)) + expected = DataFrame({'test' : [11,9,np.nan]},index=rows) + result = df.ix[rows] + assert_frame_equal(result, expected) - # inconcistent returns for unique/duplicate indices when values are missing + # inconsistent returns for unique/duplicate indices when values are missing df = DataFrame(randn(4,3),index=list('ABCD')) expected = df.ix[['E']] @@ -817,6 +818,23 @@ def test_dups_fancy_indexing(self): result = dfnu.ix[['E']] assert_frame_equal(result, expected) + # GH 4619; duplicate indexer with missing label + df = DataFrame({"A": [0, 1, 2]}) + result = df.ix[[0,8,0]] + expected = DataFrame({"A": [0, np.nan, 0]},index=[0,8,0]) + assert_frame_equal(result,expected) + + df = DataFrame({"A": list('abc')}) + result = df.ix[[0,8,0]] + expected = DataFrame({"A": ['a', np.nan, 'a']},index=[0,8,0]) + assert_frame_equal(result,expected) + + # non unique with non unique selector + df = DataFrame({'test': [5,7,9,11]}, index=['A','A','B','C']) + expected = DataFrame({'test' : [5,7,5,7,np.nan]},index=['A','A','A','A','E']) + result = df.ix[['A','A','E']] + assert_frame_equal(result, expected) + def test_indexing_mixed_frame_bug(self): # GH3492