diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 51cce713ff930..8f25a5ec5f20a 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -60,7 +60,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Warnings against the obsolete usage ``Categorical(codes, categories)``, which were emitted for instance when the first two arguments to ``Categorical()`` had different dtypes, and recommended the use of ``Categorical.from_codes``, have now been removed (:issue:`8074`) - - diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1e3c8f89c0e05..6d60cf72efd62 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -25,7 +25,6 @@ is_timedelta64_dtype, is_categorical, is_categorical_dtype, - is_integer_dtype, is_list_like, is_sequence, is_scalar, is_dict_like) @@ -261,6 +260,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # c.) infer from values if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, compat.string_types): if dtype == 'category': dtype = CategoricalDtype(categories, ordered) @@ -275,9 +275,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, ordered = dtype.ordered elif is_categorical(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments dtype = values.dtype._from_categorical_dtype(values.dtype, categories, ordered) else: + # If dtype=None and values is not categorical, create a new dtype dtype = CategoricalDtype(categories, ordered) # At this point, dtype is always a CategoricalDtype @@ -294,28 +297,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # sanitize input if is_categorical_dtype(values): + if dtype.categories is None: + dtype = CategoricalDtype(values.categories, dtype.ordered) - # we are either a Series or a CategoricalIndex - if isinstance(values, (ABCSeries, ABCCategoricalIndex)): - values = values._values - - if ordered is None: - ordered = values.ordered - if categories is None: - categories = values.categories - values = values.get_values() - - elif isinstance(values, (ABCIndexClass, ABCSeries)): - # we'll do inference later - pass - - else: - - # on numpy < 1.6 datetimelike get inferred to all i8 by - # _sanitize_array which is fine, but since factorize does this - # correctly no need here this is an issue because _sanitize_array - # also coerces np.nan to a string under certain versions of numpy - # as well + elif not isinstance(values, (ABCIndexClass, ABCSeries)): + # _sanitize_array coerces np.nan to a string under certain versions + # of numpy values = maybe_infer_to_datetimelike(values, convert_dates=True) if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) @@ -335,7 +322,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, codes, categories = factorize(values, sort=True) except TypeError: codes, categories = factorize(values, sort=False) - if ordered: + if dtype.ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories raise TypeError("'values' is not ordered, please " @@ -347,34 +334,18 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - if dtype.categories is None: - # we're inferring from values - dtype = CategoricalDtype(categories, ordered) + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) - else: - # there were two ways if categories are present - # - the old one, where each value is a int pointer to the levels - # array -> not anymore possible, but code outside of pandas could - # call us like that, so make some checks - # - the new one, where each value is also in the categories array - # (or np.nan) + elif is_categorical_dtype(values): + old_codes = (values.cat.codes if isinstance(values, ABCSeries) + else values.codes) + codes = _recode_for_categories(old_codes, values.dtype.categories, + dtype.categories) + else: codes = _get_codes_for_values(values, dtype.categories) - # TODO: check for old style usage. These warnings should be removes - # after 0.18/ in 2016 - if (is_integer_dtype(values) and - not is_integer_dtype(dtype.categories)): - warn("Values and categories have different dtypes. Did you " - "mean to use\n'Categorical.from_codes(codes, " - "categories)'?", RuntimeWarning, stacklevel=2) - - if (len(values) and is_integer_dtype(values) and - (codes == -1).all()): - warn("None of the categories were found in values. Did you " - "mean to use\n'Categorical.from_codes(codes, " - "categories)'?", RuntimeWarning, stacklevel=2) - if null_mask.any(): # Reinsert -1 placeholders for previously removed missing values full_codes = - np.ones(null_mask.shape, dtype=codes.dtype) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index b77e2d1dcda8a..f062da02b2493 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -306,20 +306,18 @@ def f(): assert len(cat.codes) == 1 assert cat.codes[0] == 0 - # Catch old style constructor useage: two arrays, codes + categories - # We can only catch two cases: + # two arrays # - when the first is an integer dtype and the second is not # - when the resulting codes are all -1/NaN - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(None): c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) # noqa - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(None): c_old = Categorical([0, 1, 2, 0, 1, 2], # noqa categories=[3, 4, 5]) - # the next one are from the old docs, but unfortunately these don't - # trigger :-( + # the next one are from the old docs with tm.assert_produces_warning(None): c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa cat = Categorical([1, 2], categories=[1, 2, 3])