Skip to content

REGR: Bug in indexing with a CategoricalIndex #16123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1630,7 +1630,8 @@ Indexing
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`)
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`)
- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
- Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`)

I/O
^^^
Expand Down
21 changes: 19 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
is_list_like,
is_interval_dtype,
is_scalar)
from pandas.core.common import _asarray_tuplesafe
from pandas.core.common import (_asarray_tuplesafe,
_values_from_object)
from pandas.core.dtypes.missing import array_equivalent
from pandas.core.algorithms import take_1d

Expand Down Expand Up @@ -353,6 +354,22 @@ def get_loc(self, key, method=None):
raise KeyError(key)
return self._engine.get_loc(codes)

def get_value(self, series, key):
    """
    Look up the value in ``series`` that corresponds to ``key``.

    The key is first treated as a label: it is unwrapped with
    ``_values_from_object``, passed through ``_convert_scalar_indexer``
    (``kind='getitem'``) and located with ``get_loc``; ``series.iloc`` at
    that location is returned.  If any of those steps raises ``KeyError``
    or ``TypeError``, the lookup is delegated to the parent class
    implementation instead.

    Fast lookup of value from 1-dimensional ndarray. Only use this if you
    know what you're doing.
    """
    try:
        label = self._convert_scalar_indexer(
            _values_from_object(key), kind='getitem')
        return series.iloc[self.get_loc(label)]
    except (KeyError, TypeError):
        # label-based lookup failed; the key might be a positional
        # indexer, so let the parent implementation handle it
        pass

    return super(CategoricalIndex, self).get_value(series, key)

def _can_reindex(self, indexer):
""" always allow reindexing """
pass
Expand Down Expand Up @@ -507,7 +524,7 @@ def _convert_list_indexer(self, keyarr, kind=None):
indexer = self.categories._convert_list_indexer(keyarr, kind=kind)
return Index(self.codes).get_indexer_for(indexer)

indexer = self.categories.get_indexer(keyarr)
indexer = self.categories.get_indexer(np.asarray(keyarr))
if (indexer == -1).any():
raise KeyError(
"a list-indexer must only "
Expand Down
137 changes: 86 additions & 51 deletions pandas/tests/indexing/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from pandas import (Series, DataFrame, Timestamp,
Categorical, CategoricalIndex)
from pandas.util.testing import assert_series_equal, assert_frame_equal
from pandas.util import testing as tm

Expand Down Expand Up @@ -66,6 +67,17 @@ def f():

pytest.raises(TypeError, f)

def test_getitem_scalar(self):
    """
    Scalar ``[]`` lookup on a Series indexed by a Categorical of
    Timestamps returns the same element as positional access.
    """
    stamps = [Timestamp('12-31-1999'),
              Timestamp('12-31-2000')]
    cats = Categorical(stamps)
    ser = Series([1, 2], index=cats)

    # positional access gives the reference value ...
    expected = ser.iloc[0]

    # ... and looking up the first categorical element must match it
    assert ser[cats[0]] == expected

def test_loc_listlike(self):

# list of labels
Expand All @@ -74,7 +86,7 @@ def test_loc_listlike(self):
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.loc[['a', 'b', 'e']]
exp_index = pd.CategoricalIndex(
exp_index = CategoricalIndex(
list('aaabbe'), categories=list('cabe'), name='B')
expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
assert_frame_equal(result, expected, check_index_type=True)
Expand All @@ -86,14 +98,14 @@ def test_loc_listlike(self):
df = self.df2.copy()
df.loc['e'] = 20
result = df.loc[['a', 'b', 'e']]
exp_index = pd.CategoricalIndex(
exp_index = CategoricalIndex(
list('aaabbe'), categories=list('cabe'), name='B')
expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index)
assert_frame_equal(result, expected)

df = self.df2.copy()
result = df.loc[['a', 'b', 'e']]
exp_index = pd.CategoricalIndex(
exp_index = CategoricalIndex(
list('aaabbe'), categories=list('cabe'), name='B')
expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
assert_frame_equal(result, expected, check_index_type=True)
Expand All @@ -105,21 +117,21 @@ def test_loc_listlike_dtypes(self):
# GH 11586

# unique categories and codes
index = pd.CategoricalIndex(['a', 'b', 'c'])
index = CategoricalIndex(['a', 'b', 'c'])
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)

# unique slice
res = df.loc[['a', 'b']]
exp_index = pd.CategoricalIndex(['a', 'b'],
categories=index.categories)
exp_index = CategoricalIndex(['a', 'b'],
categories=index.categories)
exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp, check_index_type=True)

# duplicated slice
res = df.loc[['a', 'a', 'b']]

exp_index = pd.CategoricalIndex(['a', 'a', 'b'],
categories=index.categories)
exp_index = CategoricalIndex(['a', 'a', 'b'],
categories=index.categories)
exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp, check_index_type=True)

Expand All @@ -130,22 +142,22 @@ def test_loc_listlike_dtypes(self):
df.loc[['a', 'x']]

# duplicated categories and codes
index = pd.CategoricalIndex(['a', 'b', 'a'])
index = CategoricalIndex(['a', 'b', 'a'])
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)

# unique slice
res = df.loc[['a', 'b']]
exp = DataFrame({'A': [1, 3, 2],
'B': [4, 6, 5]},
index=pd.CategoricalIndex(['a', 'a', 'b']))
index=CategoricalIndex(['a', 'a', 'b']))
tm.assert_frame_equal(res, exp, check_index_type=True)

# duplicated slice
res = df.loc[['a', 'a', 'b']]
exp = DataFrame(
{'A': [1, 3, 1, 3, 2],
'B': [4, 6, 4, 6, 5
]}, index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
]}, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
tm.assert_frame_equal(res, exp, check_index_type=True)

with tm.assertRaisesRegexp(
Expand All @@ -155,27 +167,27 @@ def test_loc_listlike_dtypes(self):
df.loc[['a', 'x']]

# contains unused category
index = pd.CategoricalIndex(
index = CategoricalIndex(
['a', 'b', 'a', 'c'], categories=list('abcde'))
df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)

res = df.loc[['a', 'b']]
exp = DataFrame({'A': [1, 3, 2],
'B': [5, 7, 6]}, index=pd.CategoricalIndex(
['a', 'a', 'b'], categories=list('abcde')))
exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]},
index=CategoricalIndex(['a', 'a', 'b'],
categories=list('abcde')))
tm.assert_frame_equal(res, exp, check_index_type=True)

res = df.loc[['a', 'e']]
exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]},
index=pd.CategoricalIndex(['a', 'a', 'e'],
categories=list('abcde')))
index=CategoricalIndex(['a', 'a', 'e'],
categories=list('abcde')))
tm.assert_frame_equal(res, exp, check_index_type=True)

# duplicated slice
res = df.loc[['a', 'a', 'b']]
exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]},
index=pd.CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
categories=list('abcde')))
index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
categories=list('abcde')))
tm.assert_frame_equal(res, exp, check_index_type=True)

with tm.assertRaisesRegexp(
Expand All @@ -184,54 +196,77 @@ def test_loc_listlike_dtypes(self):
'that are in the categories'):
df.loc[['a', 'x']]

def test_get_indexer_array(self):
    """
    ``CategoricalIndex.get_indexer`` given an object-dtype ndarray of
    Timestamps returns the positions of those values as ``intp``.
    """
    # target: object ndarray holding the two timestamps
    target = np.array([Timestamp('1999-12-31 00:00:00'),
                       Timestamp('2000-12-31 00:00:00')], dtype=object)

    # index: the same two timestamps used as both values and categories
    cats = [Timestamp('1999-12-31 00:00:00'),
            Timestamp('2000-12-31 00:00:00')]
    index = CategoricalIndex(cats, categories=cats,
                             ordered=False, dtype='category')

    tm.assert_numpy_array_equal(index.get_indexer(target),
                                np.array([0, 1], dtype='intp'))

def test_getitem_with_listlike(self):
    """
    Selecting every column of a ``get_dummies`` frame with a list of its
    own column labels gives back the expected indicator frame.
    """
    # GH 16115
    cats = Categorical([Timestamp('12-31-1999'),
                        Timestamp('12-31-2000')])

    expected = DataFrame([[1, 0], [0, 1]], dtype='uint8',
                         index=[0, 1], columns=cats)

    dummies = pd.get_dummies(cats)
    # list-like ``[]`` selection built from the frame's own column labels
    column_labels = list(dummies.columns)
    assert_frame_equal(dummies[column_labels], expected)

def test_ix_categorical_index(self):
# GH 12531
df = pd.DataFrame(np.random.randn(3, 3),
index=list('ABC'), columns=list('XYZ'))
df = DataFrame(np.random.randn(3, 3),
index=list('ABC'), columns=list('XYZ'))
cdf = df.copy()
cdf.index = pd.CategoricalIndex(df.index)
cdf.columns = pd.CategoricalIndex(df.columns)
cdf.index = CategoricalIndex(df.index)
cdf.columns = CategoricalIndex(df.columns)

expect = pd.Series(df.loc['A', :], index=cdf.columns, name='A')
expect = Series(df.loc['A', :], index=cdf.columns, name='A')
assert_series_equal(cdf.loc['A', :], expect)

expect = pd.Series(df.loc[:, 'X'], index=cdf.index, name='X')
expect = Series(df.loc[:, 'X'], index=cdf.index, name='X')
assert_series_equal(cdf.loc[:, 'X'], expect)

exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=exp_index)
exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C'])
expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=exp_index)
assert_frame_equal(cdf.loc[['A', 'B'], :], expect)

exp_columns = pd.CategoricalIndex(list('XY'),
categories=['X', 'Y', 'Z'])
expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=exp_columns)
exp_columns = CategoricalIndex(list('XY'),
categories=['X', 'Y', 'Z'])
expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=exp_columns)
assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)

# non-unique
df = pd.DataFrame(np.random.randn(3, 3),
index=list('ABA'), columns=list('XYX'))
df = DataFrame(np.random.randn(3, 3),
index=list('ABA'), columns=list('XYX'))
cdf = df.copy()
cdf.index = pd.CategoricalIndex(df.index)
cdf.columns = pd.CategoricalIndex(df.columns)
cdf.index = CategoricalIndex(df.index)
cdf.columns = CategoricalIndex(df.columns)

exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B'])
expect = pd.DataFrame(df.loc['A', :], columns=cdf.columns,
index=exp_index)
exp_index = CategoricalIndex(list('AA'), categories=['A', 'B'])
expect = DataFrame(df.loc['A', :], columns=cdf.columns,
index=exp_index)
assert_frame_equal(cdf.loc['A', :], expect)

exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y'])
expect = pd.DataFrame(df.loc[:, 'X'], index=cdf.index,
columns=exp_columns)
exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y'])
expect = DataFrame(df.loc[:, 'X'], index=cdf.index,
columns=exp_columns)
assert_frame_equal(cdf.loc[:, 'X'], expect)

expect = pd.DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=pd.CategoricalIndex(list('AAB')))
expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns,
index=CategoricalIndex(list('AAB')))
assert_frame_equal(cdf.loc[['A', 'B'], :], expect)

expect = pd.DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=pd.CategoricalIndex(list('XXY')))
expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index,
columns=CategoricalIndex(list('XXY')))
assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect)

def test_read_only_source(self):
Expand Down Expand Up @@ -281,13 +316,13 @@ def test_reindexing(self):
# then return a Categorical
cats = list('cabe')

result = self.df2.reindex(pd.Categorical(['a', 'd'], categories=cats))
result = self.df2.reindex(Categorical(['a', 'd'], categories=cats))
expected = DataFrame({'A': [0, 1, 5, np.nan],
'B': Series(list('aaad')).astype(
'category', categories=cats)}).set_index('B')
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(pd.Categorical(['a'], categories=cats))
result = self.df2.reindex(Categorical(['a'], categories=cats))
expected = DataFrame({'A': [0, 1, 5],
'B': Series(list('aaa')).astype(
'category', categories=cats)}).set_index('B')
Expand All @@ -309,15 +344,15 @@ def test_reindexing(self):
assert_frame_equal(result, expected, check_index_type=True)

# give back the type of categorical that we received
result = self.df2.reindex(pd.Categorical(
result = self.df2.reindex(Categorical(
['a', 'd'], categories=cats, ordered=True))
expected = DataFrame(
{'A': [0, 1, 5, np.nan],
'B': Series(list('aaad')).astype('category', categories=cats,
ordered=True)}).set_index('B')
assert_frame_equal(result, expected, check_index_type=True)

result = self.df2.reindex(pd.Categorical(
result = self.df2.reindex(Categorical(
['a', 'd'], categories=['a', 'd']))
expected = DataFrame({'A': [0, 1, 5, np.nan],
'B': Series(list('aaad')).astype(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,8 +490,8 @@ def test_dataframe_dummies_with_categorical(self):
'cat_x', 'cat_y']]
assert_frame_equal(result, expected)

# GH12402 Add a new parameter `drop_first` to avoid collinearity
def test_basic_drop_first(self):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
s_list = list('abc')
s_series = Series(s_list)
Expand Down