BUG: loc against CategoricalIndex may result in normal Index

sinhrks · sinhrks · commit 8ab61851eeae · 2015-11-21T10:11:05.000+09:00
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -105,3 +105,7 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+
+
+
+- Bug in ``.loc`` against ``CategoricalIndex`` may result in normal ``Index`` (:issue:`11586`)
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -3362,8 +3362,8 @@ def reindex(self, target, method=None, level=None, limit=None,
         # filling in missing if needed
         if len(missing):
             cats = self.categories.get_indexer(target)
-            if (cats==-1).any():
 
+            if (cats==-1).any():
                 # coerce to a regular index here!
                 result = Index(np.array(self),name=self.name)
                 new_target, indexer, _ = result._reindex_non_unique(np.array(target))
@@ -3397,6 +3397,12 @@ def _reindex_non_unique(self, target):
             new_indexer = np.arange(len(self.take(indexer)))
             new_indexer[check] = -1
 
+        cats = self.categories.get_indexer(target)
+        if not (cats == -1).any():
+            # .reindex returns normal Index. Revert to CategoricalIndex if
+            # all targets are included in my categories
+            new_target = self._shallow_copy(new_target)
+
         return new_target, indexer, new_indexer
 
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -984,6 +984,9 @@ def _getitem_iterable(self, key, axis=0):
                 # asarray can be unsafe, NumPy strings are weird
                 keyarr = _asarray_tuplesafe(key)
 
+            if com.is_categorical_dtype(labels):
+                keyarr = labels._shallow_copy(keyarr)
+
             # have the index handle the indexer and possibly return
             # an indexer or raising
             indexer = labels._convert_list_indexer(keyarr, kind=self.name)
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -2314,6 +2314,23 @@ def test_reindexing(self):
             actual = ci.get_indexer(finder)
             tm.assert_numpy_array_equal(expected, actual)
 
+    def test_reindex_dtype(self):
+        res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(['a', 'c'])
+        tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True)
+        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
+
+        res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(Categorical(['a', 'c']))
+        tm.assert_index_equal(res, CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']), exact=True)
+        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
+
+        res, indexer = CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c', 'd']).reindex(['a', 'c'])
+        tm.assert_index_equal(res, Index(['a', 'a', 'c'], dtype='object'), exact=True)
+        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
+
+        res, indexer = CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c', 'd']).reindex(Categorical(['a', 'c']))
+        tm.assert_index_equal(res, CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']), exact=True)
+        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
+
     def test_duplicates(self):
 
         idx = CategoricalIndex([0, 0, 0], name='foo')
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -4842,14 +4842,12 @@ def test_loc_listlike(self):
         # list of labels
         result = self.df.loc[['c','a']]
         expected = self.df.iloc[[4,0,1,5]]
-        assert_frame_equal(result, expected)
-
-        # ToDo: check_index_type can be True after GH XXX
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.loc[['a','b','e']]
         exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B')
         expected = DataFrame({'A' : [0,1,5,2,3,np.nan]}, index=exp_index)
-        assert_frame_equal(result, expected, check_index_type=False)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         # element in the categories but not in the values
         self.assertRaises(KeyError, lambda : self.df2.loc['e'])
@@ -4859,19 +4857,78 @@ def test_loc_listlike(self):
         df.loc['e'] = 20
         result = df.loc[['a','b','e']]
         exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B')
-        expected = DataFrame({'A' : [0,1,5,2,3,20]}, index=exp_index)
+        expected = DataFrame({'A' : [0, 1, 5, 2, 3, 20]}, index=exp_index)
         assert_frame_equal(result, expected)
 
         df = self.df2.copy()
         result = df.loc[['a','b','e']]
-        expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
-                              'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B')
-        assert_frame_equal(result, expected, check_index_type=False)
-
+        exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B')
+        expected = DataFrame({'A' : [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         # not all labels in the categories
         self.assertRaises(KeyError, lambda : self.df2.loc[['a','d']])
 
+    def test_loc_listlike_dtypes(self):
+        # GH 11586
+
+        # unique categories and codes
+        index = pd.CategoricalIndex(['a', 'b', 'c'])
+        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
+
+        # unique slice
+        res = df.loc[['a', 'b']]
+        exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=pd.CategoricalIndex(['a', 'b']))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        # duplicated slice
+        res = df.loc[['a', 'a', 'b']]
+        exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=pd.CategoricalIndex(['a', 'a', 'b']))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'):
+            df.loc[['a', 'x']]
+
+        # duplicated categories and codes
+        index = pd.CategoricalIndex(['a', 'b', 'a'])
+        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
+
+        # unique slice
+        res = df.loc[['a', 'b']]
+        exp = DataFrame({'A': [1, 3, 2], 'B': [4, 6, 5]}, index=pd.CategoricalIndex(['a', 'a', 'b']))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        # duplicated slice
+        res = df.loc[['a', 'a', 'b']]
+        exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [4, 6, 4, 6, 5]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'):
+            df.loc[['a', 'x']]
+
+        # contains unused category
+        index = pd.CategoricalIndex(['a', 'b', 'a', 'c'], categories=list('abcde'))
+        df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)
+
+        res = df.loc[['a', 'b']]
+        exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]},
+                        index=pd.CategoricalIndex(['a', 'a', 'b'], categories=list('abcde')))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        res = df.loc[['a', 'e']]
+        exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]},
+                        index=pd.CategoricalIndex(['a', 'a', 'e'], categories=list('abcde')))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        # duplicated slice
+        res = df.loc[['a', 'a', 'b']]
+        exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]},
+                        index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'], categories=list('abcde')))
+        tm.assert_frame_equal(res, exp, check_index_type=True)
+
+        with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'):
+            df.loc[['a', 'x']]
+
     def test_read_only_source(self):
         # GH 10043
         rw_array = np.eye(10)
@@ -4898,22 +4955,22 @@ def test_reindexing(self):
         result = self.df2.reindex(['a','b','e'])
         expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
                               'B' : Series(list('aaabbe')) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(['a','b'])
         expected = DataFrame({'A' : [0,1,5,2,3],
                               'B' : Series(list('aaabb')) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(['e'])
         expected = DataFrame({'A' : [np.nan],
                               'B' : Series(['e']) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(['d'])
         expected = DataFrame({'A' : [np.nan],
                               'B' : Series(['d']) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         # since we are actually reindexing with a Categorical
         # then return a Categorical
@@ -4922,38 +4979,38 @@ def test_reindexing(self):
         result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats))
         expected = DataFrame({'A' : [0,1,5,np.nan],
                               'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(pd.Categorical(['a'],categories=cats))
         expected = DataFrame({'A' : [0,1,5],
                               'B' : Series(list('aaa')).astype('category',categories=cats) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(['a','b','e'])
         expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
                               'B' : Series(list('aaabbe')) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(['a','b'])
         expected = DataFrame({'A' : [0,1,5,2,3],
                               'B' : Series(list('aaabb')) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(['e'])
         expected = DataFrame({'A' : [np.nan],
                               'B' : Series(['e']) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         # give back the type of categorical that we received
         result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True))
         expected = DataFrame({'A' : [0,1,5,np.nan],
                               'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d']))
         expected = DataFrame({'A' : [0,1,5,np.nan],
                               'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B')
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_index_type=True)
 
         # passed duplicate indexers are not allowed
         self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a']))