From a6cb4b371c11921d6b75157c15aa17a6b869f9ba Mon Sep 17 00:00:00 2001 From: William Wagner Date: Wed, 17 Aug 2016 22:09:50 -0400 Subject: [PATCH] ERR: Categoricals shouldn't allow non-strings when object dtype is passed (#13919) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 16 +++++++++++++++- pandas/tests/test_categorical.py | 30 ++++++++++++++++++++++++++---- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2412b645221ab..bd8541432061e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1071,3 +1071,4 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. +- Bug in ``Categorical`` that allowed creation when ``object`` dtype was passed in with non-string values (:issue:`13919`) \ No newline at end of file diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6ea0a5e96672d..cc4084c118062 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -18,9 +18,12 @@ is_dtype_equal, is_datetimelike, is_categorical_dtype, + is_string_like, is_integer_dtype, is_bool, is_list_like, is_sequence, - is_scalar) + is_scalar, + is_object_dtype, + is_period_arraylike) from pandas.core.common import is_null_slice from pandas.core.algorithms import factorize, take_1d @@ -191,6 +194,8 @@ class Categorical(PandasObject): If an explicit ``ordered=True`` is given but no `categories` and the `values` are not sortable. + If `values` is of `object` dtype and its non-null elements are + neither all strings nor all periods. 
Examples -------- @@ -231,6 +236,15 @@ class Categorical(PandasObject): def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False): + # categoricals w/ object dtype shouldn't allow non-strings + if is_object_dtype(values): + values = _convert_to_list_like(values) + # for now, periods are an exception + if not all(is_string_like(v) for v in values + if notnull(v)) and not is_period_arraylike(values): + raise TypeError("Categoricals cannot be object dtype unless" + " all values are strings or all are periods.") + if fastpath: # fast path self._codes = _coerce_indexer_dtype(values, categories) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index b630e0914259e..9abab170e7c55 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -94,13 +94,35 @@ def test_constructor_unsortable(self): # it works! arr = np.array([1, 2, 3, datetime.now()], dtype='O') - factor = Categorical.from_array(arr, ordered=False) - self.assertFalse(factor.ordered) + msg = "Categoricals cannot be object dtype unless all values are " \ + "strings or all are periods." + with tm.assertRaisesRegexp(TypeError, msg): + factor = Categorical.from_array(arr, ordered=False) # this however will raise as cannot be sorted self.assertRaises( TypeError, lambda: Categorical.from_array(arr, ordered=True)) + def test_constructor_object_dtype(self): + #GH 13919 + + #categories must be of single dtype + arr = np.array([1, 2, 3, 's'], dtype=object) + msg = "Categoricals cannot be object dtype unless all values are " \ + "strings or all are periods." 
+ with tm.assertRaisesRegexp(TypeError, msg): + c = Categorical.from_array(arr) + + # object dtype allowed when all strs + exp_arr = np.array(list('abcd'), dtype=object) + c = Categorical.from_array(exp_arr) + tm.assert_numpy_array_equal(c.__array__(), exp_arr) + + # object dtype also allowed when all periods + idx = pd.period_range('1/1/2000', freq='D', periods=5) + c = Categorical(idx) + tm.assert_index_equal(c.categories, idx) + def test_is_equal_dtype(self): # test dtype comparisons between cats @@ -4255,7 +4277,6 @@ def test_str_accessor_api_for_categorical(self): ('endswith', ("a",), {}), ('extract', ("([a-z]*) ",), {"expand":False}), ('extract', ("([a-z]*) ",), {"expand":True}), - ('extractall', ("([a-z]*) ",), {}), ('find', ("a",), {}), ('findall', ("a",), {}), ('index', (" ",), {}), @@ -4286,7 +4307,8 @@ def test_str_accessor_api_for_categorical(self): # we can't make a categorical with lists as individual categories. # -> `s.str.split(" ").astype("category")` will error! # * `translate` has different interfaces for py2 vs. py3 - _ignore_names = ["get", "join", "translate"] + # extractall creates Categorical w/ object dtype and int, which raises + _ignore_names = ["get", "join", "translate", "extractall"] str_func_names = [f for f in dir(s.str)