From 8ab61851eeae0d16deaa2f6eafcf59393b06aaeb Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 14 Nov 2015 16:37:17 +0900 Subject: [PATCH] BUG: loc against CategoricalIndex may result in normal Index --- doc/source/whatsnew/v0.18.0.txt | 4 ++ pandas/core/index.py | 8 ++- pandas/core/indexing.py | 3 + pandas/tests/test_index.py | 17 ++++++ pandas/tests/test_indexing.py | 97 ++++++++++++++++++++++++++------- 5 files changed, 108 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 5ccf829fd5a42..ee0cd0bf3c5ff 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -105,3 +105,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + + + +- Bug in ``.loc`` against ``CategoricalIndex`` may result in normal ``Index`` (:issue:`11586`) diff --git a/pandas/core/index.py b/pandas/core/index.py index cdd0de4e1196d..2099c1996b66b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3362,8 +3362,8 @@ def reindex(self, target, method=None, level=None, limit=None, # filling in missing if needed if len(missing): cats = self.categories.get_indexer(target) - if (cats==-1).any(): + if (cats==-1).any(): # coerce to a regular index here! result = Index(np.array(self),name=self.name) new_target, indexer, _ = result._reindex_non_unique(np.array(target)) @@ -3397,6 +3397,12 @@ def _reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[check] = -1 + cats = self.categories.get_indexer(target) + if not (cats == -1).any(): + # .reindex returns normal Index. 
Revert to CategoricalIndex if + # all targets are included in my categories + new_target = self._shallow_copy(new_target) + return new_target, indexer, new_indexer def get_indexer(self, target, method=None, limit=None, tolerance=None): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 2b1cb0a1e1b31..9df72053fb0af 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -984,6 +984,9 @@ def _getitem_iterable(self, key, axis=0): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) + if com.is_categorical_dtype(labels): + keyarr = labels._shallow_copy(keyarr) + # have the index handle the indexer and possibly return # an indexer or raising indexer = labels._convert_list_indexer(keyarr, kind=self.name) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 43a8b801fa813..bc9d303dc3b1a 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -2314,6 +2314,23 @@ def test_reindexing(self): actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual) + def test_reindex_dtype(self): + res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(['a', 'c']) + tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2])) + + res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(Categorical(['a', 'c'])) + tm.assert_index_equal(res, CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2])) + + res, indexer = CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c', 'd']).reindex(['a', 'c']) + tm.assert_index_equal(res, Index(['a', 'a', 'c'], dtype='object'), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2])) + + res, indexer = CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c', 'd']).reindex(Categorical(['a', 'c'])) + tm.assert_index_equal(res, CategoricalIndex(['a', 'a', 'c'], 
categories=['a', 'c']), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2])) + def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index cb06b714d4700..66850ab29af39 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -4842,14 +4842,12 @@ def test_loc_listlike(self): # list of labels result = self.df.loc[['c','a']] expected = self.df.iloc[[4,0,1,5]] - assert_frame_equal(result, expected) - - # ToDo: check_index_type can be True after GH XXX + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.loc[['a','b','e']] exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B') expected = DataFrame({'A' : [0,1,5,2,3,np.nan]}, index=exp_index) - assert_frame_equal(result, expected, check_index_type=False) + assert_frame_equal(result, expected, check_index_type=True) # element in the categories but not in the values self.assertRaises(KeyError, lambda : self.df2.loc['e']) @@ -4859,19 +4857,78 @@ def test_loc_listlike(self): df.loc['e'] = 20 result = df.loc[['a','b','e']] exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A' : [0,1,5,2,3,20]}, index=exp_index) + expected = DataFrame({'A' : [0, 1, 5, 2, 3, 20]}, index=exp_index) assert_frame_equal(result, expected) df = self.df2.copy() result = df.loc[['a','b','e']] - expected = DataFrame({'A' : [0,1,5,2,3,np.nan], - 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') - assert_frame_equal(result, expected, check_index_type=False) - + exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B') + expected = DataFrame({'A' : [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + assert_frame_equal(result, expected, check_index_type=True) # not all labels in the categories self.assertRaises(KeyError, lambda : self.df2.loc[['a','d']]) 
+ def test_loc_listlike_dtypes(self): + # GH 11586 + + # unique categories and codes + index = pd.CategoricalIndex(['a', 'b', 'c']) + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) + + # unique slice + res = df.loc[['a', 'b']] + exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=pd.CategoricalIndex(['a', 'b'])) + tm.assert_frame_equal(res, exp, check_index_type=True) + + # duplicated slice + res = df.loc[['a', 'a', 'b']] + exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=pd.CategoricalIndex(['a', 'a', 'b'])) + tm.assert_frame_equal(res, exp, check_index_type=True) + + with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'): + df.loc[['a', 'x']] + + # duplicated categories and codes + index = pd.CategoricalIndex(['a', 'b', 'a']) + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) + + # unique slice + res = df.loc[['a', 'b']] + exp = DataFrame({'A': [1, 3, 2], 'B': [4, 6, 5]}, index=pd.CategoricalIndex(['a', 'a', 'b'])) + tm.assert_frame_equal(res, exp, check_index_type=True) + + # duplicated slice + res = df.loc[['a', 'a', 'b']] + exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [4, 6, 4, 6, 5]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) + tm.assert_frame_equal(res, exp, check_index_type=True) + + with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'): + df.loc[['a', 'x']] + + # contains unused category + index = pd.CategoricalIndex(['a', 'b', 'a', 'c'], categories=list('abcde')) + df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index) + + res = df.loc[['a', 'b']] + exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]}, + index=pd.CategoricalIndex(['a', 'a', 'b'], categories=list('abcde'))) + tm.assert_frame_equal(res, exp, check_index_type=True) + + res = df.loc[['a', 'e']] + exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]}, + index=pd.CategoricalIndex(['a', 'a', 'e'], categories=list('abcde'))) + 
tm.assert_frame_equal(res, exp, check_index_type=True) + + # duplicated slice + res = df.loc[['a', 'a', 'b']] + exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]}, + index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'], categories=list('abcde'))) + tm.assert_frame_equal(res, exp, check_index_type=True) + + with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'): + df.loc[['a', 'x']] + def test_read_only_source(self): # GH 10043 rw_array = np.eye(10) @@ -4898,22 +4955,22 @@ def test_reindexing(self): result = self.df2.reindex(['a','b','e']) expected = DataFrame({'A' : [0,1,5,2,3,np.nan], 'B' : Series(list('aaabbe')) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(['a','b']) expected = DataFrame({'A' : [0,1,5,2,3], 'B' : Series(list('aaabb')) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(['e']) expected = DataFrame({'A' : [np.nan], 'B' : Series(['e']) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(['d']) expected = DataFrame({'A' : [np.nan], 'B' : Series(['d']) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) # since we are actually reindexing with a Categorical # then return a Categorical @@ -4922,38 +4979,38 @@ def test_reindexing(self): result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats)) expected = DataFrame({'A' : [0,1,5,np.nan], 'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(pd.Categorical(['a'],categories=cats)) expected = DataFrame({'A' : [0,1,5], 'B' : 
Series(list('aaa')).astype('category',categories=cats) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(['a','b','e']) expected = DataFrame({'A' : [0,1,5,2,3,np.nan], 'B' : Series(list('aaabbe')) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(['a','b']) expected = DataFrame({'A' : [0,1,5,2,3], 'B' : Series(list('aaabb')) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(['e']) expected = DataFrame({'A' : [np.nan], 'B' : Series(['e']) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True)) expected = DataFrame({'A' : [0,1,5,np.nan], 'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d'])) expected = DataFrame({'A' : [0,1,5,np.nan], 'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B') - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a']))