Skip to content

Commit 8ab6185

Browse files
committed
BUG: loc against CategoricalIndex may result in normal Index
1 parent 5823a6d commit 8ab6185

File tree

5 files changed

+108
-21
lines changed

5 files changed

+108
-21
lines changed

doc/source/whatsnew/v0.18.0.txt

+4
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,7 @@ Performance Improvements
105105

106106
Bug Fixes
107107
~~~~~~~~~
108+
109+
110+
111+
- Bug in ``.loc`` against ``CategoricalIndex`` where the result could have a normal ``Index`` instead of a ``CategoricalIndex`` (:issue:`11586`)

pandas/core/index.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -3362,8 +3362,8 @@ def reindex(self, target, method=None, level=None, limit=None,
33623362
# filling in missing if needed
33633363
if len(missing):
33643364
cats = self.categories.get_indexer(target)
3365-
if (cats==-1).any():
33663365

3366+
if (cats==-1).any():
33673367
# coerce to a regular index here!
33683368
result = Index(np.array(self),name=self.name)
33693369
new_target, indexer, _ = result._reindex_non_unique(np.array(target))
@@ -3397,6 +3397,12 @@ def _reindex_non_unique(self, target):
33973397
new_indexer = np.arange(len(self.take(indexer)))
33983398
new_indexer[check] = -1
33993399

3400+
cats = self.categories.get_indexer(target)
3401+
if not (cats == -1).any():
3402+
# .reindex returns normal Index. Revert to CategoricalIndex if
3403+
# all targets are included in my categories
3404+
new_target = self._shallow_copy(new_target)
3405+
34003406
return new_target, indexer, new_indexer
34013407

34023408
def get_indexer(self, target, method=None, limit=None, tolerance=None):

pandas/core/indexing.py

+3
Original file line numberDiff line numberDiff line change
@@ -984,6 +984,9 @@ def _getitem_iterable(self, key, axis=0):
984984
# asarray can be unsafe, NumPy strings are weird
985985
keyarr = _asarray_tuplesafe(key)
986986

987+
if com.is_categorical_dtype(labels):
988+
keyarr = labels._shallow_copy(keyarr)
989+
987990
# have the index handle the indexer and possibly return
988991
# an indexer or raising
989992
indexer = labels._convert_list_indexer(keyarr, kind=self.name)

pandas/tests/test_index.py

+17
Original file line numberDiff line numberDiff line change
@@ -2314,6 +2314,23 @@ def test_reindexing(self):
23142314
actual = ci.get_indexer(finder)
23152315
tm.assert_numpy_array_equal(expected, actual)
23162316

2317+
def test_reindex_dtype(self):
2318+
res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(['a', 'c'])
2319+
tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True)
2320+
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
2321+
2322+
res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(Categorical(['a', 'c']))
2323+
tm.assert_index_equal(res, CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']), exact=True)
2324+
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
2325+
2326+
res, indexer = CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c', 'd']).reindex(['a', 'c'])
2327+
tm.assert_index_equal(res, Index(['a', 'a', 'c'], dtype='object'), exact=True)
2328+
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
2329+
2330+
res, indexer = CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c', 'd']).reindex(Categorical(['a', 'c']))
2331+
tm.assert_index_equal(res, CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']), exact=True)
2332+
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
2333+
23172334
def test_duplicates(self):
23182335

23192336
idx = CategoricalIndex([0, 0, 0], name='foo')

pandas/tests/test_indexing.py

+77-20
Original file line numberDiff line numberDiff line change
@@ -4842,14 +4842,12 @@ def test_loc_listlike(self):
48424842
# list of labels
48434843
result = self.df.loc[['c','a']]
48444844
expected = self.df.iloc[[4,0,1,5]]
4845-
assert_frame_equal(result, expected)
4846-
4847-
# ToDo: check_index_type can be True after GH XXX
4845+
assert_frame_equal(result, expected, check_index_type=True)
48484846

48494847
result = self.df2.loc[['a','b','e']]
48504848
exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B')
48514849
expected = DataFrame({'A' : [0,1,5,2,3,np.nan]}, index=exp_index)
4852-
assert_frame_equal(result, expected, check_index_type=False)
4850+
assert_frame_equal(result, expected, check_index_type=True)
48534851

48544852
# element in the categories but not in the values
48554853
self.assertRaises(KeyError, lambda : self.df2.loc['e'])
@@ -4859,19 +4857,78 @@ def test_loc_listlike(self):
48594857
df.loc['e'] = 20
48604858
result = df.loc[['a','b','e']]
48614859
exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B')
4862-
expected = DataFrame({'A' : [0,1,5,2,3,20]}, index=exp_index)
4860+
expected = DataFrame({'A' : [0, 1, 5, 2, 3, 20]}, index=exp_index)
48634861
assert_frame_equal(result, expected)
48644862

48654863
df = self.df2.copy()
48664864
result = df.loc[['a','b','e']]
4867-
expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
4868-
'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B')
4869-
assert_frame_equal(result, expected, check_index_type=False)
4870-
4865+
exp_index = pd.CategoricalIndex(list('aaabbe'), categories=list('cabe'), name='B')
4866+
expected = DataFrame({'A' : [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
4867+
assert_frame_equal(result, expected, check_index_type=True)
48714868

48724869
# not all labels in the categories
48734870
self.assertRaises(KeyError, lambda : self.df2.loc[['a','d']])
48744871

4872+
def test_loc_listlike_dtypes(self):
4873+
# GH 11586
4874+
4875+
# unique categories and codes
4876+
index = pd.CategoricalIndex(['a', 'b', 'c'])
4877+
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
4878+
4879+
# unique slice
4880+
res = df.loc[['a', 'b']]
4881+
exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=pd.CategoricalIndex(['a', 'b']))
4882+
tm.assert_frame_equal(res, exp, check_index_type=True)
4883+
4884+
# duplicated slice
4885+
res = df.loc[['a', 'a', 'b']]
4886+
exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=pd.CategoricalIndex(['a', 'a', 'b']))
4887+
tm.assert_frame_equal(res, exp, check_index_type=True)
4888+
4889+
with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'):
4890+
df.loc[['a', 'x']]
4891+
4892+
# duplicated categories and codes
4893+
index = pd.CategoricalIndex(['a', 'b', 'a'])
4894+
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)
4895+
4896+
# unique slice
4897+
res = df.loc[['a', 'b']]
4898+
exp = DataFrame({'A': [1, 3, 2], 'B': [4, 6, 5]}, index=pd.CategoricalIndex(['a', 'a', 'b']))
4899+
tm.assert_frame_equal(res, exp, check_index_type=True)
4900+
4901+
# duplicated slice
4902+
res = df.loc[['a', 'a', 'b']]
4903+
exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [4, 6, 4, 6, 5]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
4904+
tm.assert_frame_equal(res, exp, check_index_type=True)
4905+
4906+
with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'):
4907+
df.loc[['a', 'x']]
4908+
4909+
# contains unused category
4910+
index = pd.CategoricalIndex(['a', 'b', 'a', 'c'], categories=list('abcde'))
4911+
df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)
4912+
4913+
res = df.loc[['a', 'b']]
4914+
exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]},
4915+
index=pd.CategoricalIndex(['a', 'a', 'b'], categories=list('abcde')))
4916+
tm.assert_frame_equal(res, exp, check_index_type=True)
4917+
4918+
res = df.loc[['a', 'e']]
4919+
exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]},
4920+
index=pd.CategoricalIndex(['a', 'a', 'e'], categories=list('abcde')))
4921+
tm.assert_frame_equal(res, exp, check_index_type=True)
4922+
4923+
# duplicated slice
4924+
res = df.loc[['a', 'a', 'b']]
4925+
exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]},
4926+
index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'], categories=list('abcde')))
4927+
tm.assert_frame_equal(res, exp, check_index_type=True)
4928+
4929+
with tm.assertRaisesRegexp(KeyError, 'a list-indexer must only include values that are in the categories'):
4930+
df.loc[['a', 'x']]
4931+
48754932
def test_read_only_source(self):
48764933
# GH 10043
48774934
rw_array = np.eye(10)
@@ -4898,22 +4955,22 @@ def test_reindexing(self):
48984955
result = self.df2.reindex(['a','b','e'])
48994956
expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
49004957
'B' : Series(list('aaabbe')) }).set_index('B')
4901-
assert_frame_equal(result, expected)
4958+
assert_frame_equal(result, expected, check_index_type=True)
49024959

49034960
result = self.df2.reindex(['a','b'])
49044961
expected = DataFrame({'A' : [0,1,5,2,3],
49054962
'B' : Series(list('aaabb')) }).set_index('B')
4906-
assert_frame_equal(result, expected)
4963+
assert_frame_equal(result, expected, check_index_type=True)
49074964

49084965
result = self.df2.reindex(['e'])
49094966
expected = DataFrame({'A' : [np.nan],
49104967
'B' : Series(['e']) }).set_index('B')
4911-
assert_frame_equal(result, expected)
4968+
assert_frame_equal(result, expected, check_index_type=True)
49124969

49134970
result = self.df2.reindex(['d'])
49144971
expected = DataFrame({'A' : [np.nan],
49154972
'B' : Series(['d']) }).set_index('B')
4916-
assert_frame_equal(result, expected)
4973+
assert_frame_equal(result, expected, check_index_type=True)
49174974

49184975
# since we are actually reindexing with a Categorical
49194976
# then return a Categorical
@@ -4922,38 +4979,38 @@ def test_reindexing(self):
49224979
result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats))
49234980
expected = DataFrame({'A' : [0,1,5,np.nan],
49244981
'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B')
4925-
assert_frame_equal(result, expected)
4982+
assert_frame_equal(result, expected, check_index_type=True)
49264983

49274984
result = self.df2.reindex(pd.Categorical(['a'],categories=cats))
49284985
expected = DataFrame({'A' : [0,1,5],
49294986
'B' : Series(list('aaa')).astype('category',categories=cats) }).set_index('B')
4930-
assert_frame_equal(result, expected)
4987+
assert_frame_equal(result, expected, check_index_type=True)
49314988

49324989
result = self.df2.reindex(['a','b','e'])
49334990
expected = DataFrame({'A' : [0,1,5,2,3,np.nan],
49344991
'B' : Series(list('aaabbe')) }).set_index('B')
4935-
assert_frame_equal(result, expected)
4992+
assert_frame_equal(result, expected, check_index_type=True)
49364993

49374994
result = self.df2.reindex(['a','b'])
49384995
expected = DataFrame({'A' : [0,1,5,2,3],
49394996
'B' : Series(list('aaabb')) }).set_index('B')
4940-
assert_frame_equal(result, expected)
4997+
assert_frame_equal(result, expected, check_index_type=True)
49414998

49424999
result = self.df2.reindex(['e'])
49435000
expected = DataFrame({'A' : [np.nan],
49445001
'B' : Series(['e']) }).set_index('B')
4945-
assert_frame_equal(result, expected)
5002+
assert_frame_equal(result, expected, check_index_type=True)
49465003

49475004
# give back the type of categorical that we received
49485005
result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True))
49495006
expected = DataFrame({'A' : [0,1,5,np.nan],
49505007
'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B')
4951-
assert_frame_equal(result, expected)
5008+
assert_frame_equal(result, expected, check_index_type=True)
49525009

49535010
result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d']))
49545011
expected = DataFrame({'A' : [0,1,5,np.nan],
49555012
'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B')
4956-
assert_frame_equal(result, expected)
5013+
assert_frame_equal(result, expected, check_index_type=True)
49575014

49585015
# passed duplicate indexers are not allowed
49595016
self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a']))

0 commit comments

Comments
 (0)