Skip to content

BUG: Categoricals shouldn't allow non-strings when object dtype is passed (#13919) #14027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1071,3 +1071,4 @@ Bug Fixes
- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
- Bug in ``.to_string()`` raising ``UnboundLocalError`` when called with an integer ``line_width`` and ``index=False``, because ``idx`` was referenced before assignment.
- Bug in ``Categorical`` where creation was allowed when ``object`` dtype values containing non-strings were passed in (:issue:`13919`)
16 changes: 15 additions & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,12 @@
is_dtype_equal,
is_datetimelike,
is_categorical_dtype,
is_string_like,
is_integer_dtype, is_bool,
is_list_like, is_sequence,
is_scalar)
is_scalar,
is_object_dtype,
is_period_arraylike)
from pandas.core.common import is_null_slice

from pandas.core.algorithms import factorize, take_1d
Expand Down Expand Up @@ -191,6 +194,8 @@ class Categorical(PandasObject):
If an explicit ``ordered=True`` is given but no `categories` and the
`values` are not sortable.

If `values` has `object` dtype and its non-null elements are neither
all strings nor all periods.

Examples
--------
Expand Down Expand Up @@ -231,6 +236,15 @@ class Categorical(PandasObject):
def __init__(self, values, categories=None, ordered=False,
name=None, fastpath=False):

# categoricals w/ object dtype shouldn't allow non-strings
if is_object_dtype(values):
values = _convert_to_list_like(values)
# for now, periods are an exception
if not all(is_string_like(v) for v in values
if notnull(v)) and not is_period_arraylike(values):
raise TypeError("Categoricals cannot be object dtype unless"
" all values are strings or all are periods.")

if fastpath:
# fast path
self._codes = _coerce_indexer_dtype(values, categories)
Expand Down
30 changes: 26 additions & 4 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,35 @@ def test_constructor_unsortable(self):

# mixed non-string values in an object-dtype array are rejected
arr = np.array([1, 2, 3, datetime.now()], dtype='O')
factor = Categorical.from_array(arr, ordered=False)
self.assertFalse(factor.ordered)
msg = "Categoricals cannot be object dtype unless all values are " \
"strings or all are periods."
with tm.assertRaisesRegexp(TypeError, msg):
factor = Categorical.from_array(arr, ordered=False)

# this however will raise as cannot be sorted
self.assertRaises(
TypeError, lambda: Categorical.from_array(arr, ordered=True))

def test_constructor_object_dtype(self):
    """Object-dtype values must be all strings or all periods."""
    # GH 13919
    msg = ("Categoricals cannot be object dtype unless all values are "
           "strings or all are periods.")

    # an object array mixing ints and strings must be rejected; the
    # result is not bound to a name because the call is expected to raise
    arr = np.array([1, 2, 3, 's'], dtype=object)
    with tm.assertRaisesRegexp(TypeError, msg):
        Categorical.from_array(arr)

    # object dtype allowed when all strs
    exp_arr = np.array(list('abcd'), dtype=object)
    c = Categorical.from_array(exp_arr)
    tm.assert_numpy_array_equal(c.__array__(), exp_arr)

    # object dtype also allowed when all periods
    idx = pd.period_range('1/1/2000', freq='D', periods=5)
    c = Categorical(idx)
    tm.assert_index_equal(c.categories, idx)

def test_is_equal_dtype(self):

# test dtype comparisons between cats
Expand Down Expand Up @@ -4255,7 +4277,6 @@ def test_str_accessor_api_for_categorical(self):
('endswith', ("a",), {}),
('extract', ("([a-z]*) ",), {"expand":False}),
('extract', ("([a-z]*) ",), {"expand":True}),
('extractall', ("([a-z]*) ",), {}),
('find', ("a",), {}),
('findall', ("a",), {}),
('index', (" ",), {}),
Expand Down Expand Up @@ -4286,7 +4307,8 @@ def test_str_accessor_api_for_categorical(self):
# we can't make a categorical with lists as individual categories.
# -> `s.str.split(" ").astype("category")` will error!
# * `translate` has different interfaces for py2 vs. py3
_ignore_names = ["get", "join", "translate"]
# extractall creates Categorical w/ object dtype and int, which raises
_ignore_names = ["get", "join", "translate", "extractall"]

str_func_names = [f
for f in dir(s.str)
Expand Down