Skip to content

Commit a6cb4b3

Browse files
committed
ERR: Categoricals shouldn't allow non-strings when object dtype is passed (pandas-dev#13919)
1 parent 6fa2b03 commit a6cb4b3

File tree

3 files changed

+42
-5
lines changed

3 files changed

+42
-5
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1071,3 +1071,4 @@ Bug Fixes
10711071
- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
10721072
- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
10731073
- Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an ``UnboundLocalError`` exception because ``idx`` is referenced before assignment.
1074+
- Bug in ``Categorical`` where creation was allowed when ``object`` dtype values containing non-strings were passed in (:issue:`13919`)

pandas/core/categorical.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@
1818
is_dtype_equal,
1919
is_datetimelike,
2020
is_categorical_dtype,
21+
is_string_like,
2122
is_integer_dtype, is_bool,
2223
is_list_like, is_sequence,
23-
is_scalar)
24+
is_scalar,
25+
is_object_dtype,
26+
is_period_arraylike)
2427
from pandas.core.common import is_null_slice
2528

2629
from pandas.core.algorithms import factorize, take_1d
@@ -191,6 +194,8 @@ class Categorical(PandasObject):
191194
If an explicit ``ordered=True`` is given but no `categories` and the
192195
`values` are not sortable.
193196
197+
If `values` has `object` dtype and its non-null elements are
198+
neither all strings nor all periods.
194199
195200
Examples
196201
--------
@@ -231,6 +236,15 @@ class Categorical(PandasObject):
231236
def __init__(self, values, categories=None, ordered=False,
232237
name=None, fastpath=False):
233238

239+
# categoricals w/ object dtype shouldn't allow non-strings
240+
if is_object_dtype(values):
241+
values = _convert_to_list_like(values)
242+
# for now, periods are an exception
243+
if not all(is_string_like(v) for v in values
244+
if notnull(v)) and not is_period_arraylike(values):
245+
raise TypeError("Categoricals cannot be object dtype unless"
246+
" all values are strings or all are periods.")
247+
234248
if fastpath:
235249
# fast path
236250
self._codes = _coerce_indexer_dtype(values, categories)

pandas/tests/test_categorical.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,35 @@ def test_constructor_unsortable(self):
9494

9595
# it works!
9696
arr = np.array([1, 2, 3, datetime.now()], dtype='O')
97-
factor = Categorical.from_array(arr, ordered=False)
98-
self.assertFalse(factor.ordered)
97+
msg = "Categoricals cannot be object dtype unless all values are " \
98+
"strings or all are periods."
99+
with tm.assertRaisesRegexp(TypeError, msg):
100+
factor = Categorical.from_array(arr, ordered=False)
99101

100102
# this however will raise as cannot be sorted
101103
self.assertRaises(
102104
TypeError, lambda: Categorical.from_array(arr, ordered=True))
103105

106+
def test_constructor_object_dtype(self):
107+
# GH 13919
108+
109+
# object dtype with mixed int and str values must raise
110+
arr = np.array([1, 2, 3, 's'], dtype=object)
111+
msg = "Categoricals cannot be object dtype unless all values are " \
112+
"strings or all are periods."
113+
with tm.assertRaisesRegexp(TypeError, msg):
114+
c = Categorical.from_array(arr)
115+
116+
# object dtype allowed when all strs
117+
exp_arr = np.array(list('abcd'), dtype=object)
118+
c = Categorical.from_array(exp_arr)
119+
tm.assert_numpy_array_equal(c.__array__(), exp_arr)
120+
121+
# object dtype also allowed when all periods
122+
idx = pd.period_range('1/1/2000', freq='D', periods=5)
123+
c = Categorical(idx)
124+
tm.assert_index_equal(c.categories, idx)
125+
104126
def test_is_equal_dtype(self):
105127

106128
# test dtype comparisons between cats
@@ -4255,7 +4277,6 @@ def test_str_accessor_api_for_categorical(self):
42554277
('endswith', ("a",), {}),
42564278
('extract', ("([a-z]*) ",), {"expand":False}),
42574279
('extract', ("([a-z]*) ",), {"expand":True}),
4258-
('extractall', ("([a-z]*) ",), {}),
42594280
('find', ("a",), {}),
42604281
('findall', ("a",), {}),
42614282
('index', (" ",), {}),
@@ -4286,7 +4307,8 @@ def test_str_accessor_api_for_categorical(self):
42864307
# we can't make a categorical with lists as individual categories.
42874308
# -> `s.str.split(" ").astype("category")` will error!
42884309
# * `translate` has different interfaces for py2 vs. py3
4289-
_ignore_names = ["get", "join", "translate"]
4310+
# extractall creates Categorical w/ object dtype and int, which raises
4311+
_ignore_names = ["get", "join", "translate", "extractall"]
42904312

42914313
str_func_names = [f
42924314
for f in dir(s.str)

0 commit comments

Comments
 (0)