diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 3bda8c7eacb61..799d04859cc2a 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) - df['B'] = df['B'].astype('category', categories=list('cab')) + df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes df.B.cat.categories diff --git a/doc/source/api.rst b/doc/source/api.rst index 6b3e6bedcb24b..b822b7943f1d6 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like Categorical ~~~~~~~~~~~ -If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical +.. autoclass:: api.types.CategoricalDtype + :members: categories, ordered + +If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the following usable methods and properties: diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 65361886436d6..d2b53420764b2 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of + +1. categories are inferred from the data +2. categories are unordered. + +To control those behaviors, instead of passing ``'category'``, use an instance +of :class:`~pandas.api.types.CategoricalDtype`. .. 
ipython:: python - s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + from pandas.api.types import CategoricalDtype + + s = pd.Series(["a", "b", "c", "a"]) + cat_type = CategoricalDtype(categories=["b", "c", "d"], + ordered=True) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -133,6 +143,75 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. _categorical.categoricaldtype: + +CategoricalDtype +---------------- + +.. versionchanged:: 0.21.0 + +A categorical's type is fully described by + +1. ``categories``: a sequence of unique values and no missing values +2. ``ordered``: a boolean + +This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`. +The ``categories`` argument is optional, which implies that the actual categories +should be inferred from whatever is present in the data when the +:class:`pandas.Categorical` is created. The categories are assumed to be unordered +by default. + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + CategoricalDtype(['a', 'b', 'c']) + CategoricalDtype(['a', 'b', 'c'], ordered=True) + CategoricalDtype() + +A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas +expects a `dtype`. For example, :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, or in the Series constructor. + +.. note:: + + As a convenience, you can use the string ``'category'`` in place of a + :class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of + the categories being unordered, and equal to the set of values present in the + array. In other words, ``dtype='category'`` is equivalent to + ``dtype=CategoricalDtype()``. 
+ +Equality Semantics +~~~~~~~~~~~~~~~~~~ + +Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal +whenever they have the same categories and orderedness. When comparing two +unordered categoricals, the order of the ``categories`` is not considered. + +.. ipython:: python + + c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False) + + # Equal, since order is not considered when ordered=False + c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) + + # Unequal, since the second CategoricalDtype is ordered + c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + +All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. + +.. ipython:: python + + c1 == 'category' + +.. warning:: + + Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, + and since all instances of ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a + ``CategoricalDtype(None, False)``, regardless of ``categories`` or + ``ordered``. + Description ----------- @@ -184,7 +263,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype('category', categories=list('abcd')) + s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd'))) s # categories @@ -297,7 +376,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype( + CategoricalDtype(ordered=True) + ) s.sort_values(inplace=True) s s.min(), s.max() @@ -397,9 +478,15 @@ categories or a categorical with any list-like object, will raise a TypeError. .. 
ipython:: python - cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base = pd.Series([2,2,2]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base2 = pd.Series([2,2,2]).astype( + CategoricalDtype(ordered=True) + ) cat cat_base diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 72787ea97a782..ad40c75a62722 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -830,8 +830,10 @@ The left frame. .. ipython:: python + from pandas.api.types import CategoricalDtype + X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype('category', categories=['foo', 'bar']) + X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) left = pd.DataFrame({'X': X, 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) @@ -842,8 +844,11 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + right = pd.DataFrame({ + 'X': pd.Series(['foo', 'bar'], + dtype=CategoricalDtype(['foo', 'bar'])), + 'Z': [1, 2] + }) right right.dtypes diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5a353544a4283..82b793f7f84c3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -10,6 +10,8 @@ users upgrade to this version. Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying + categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. 
@@ -89,6 +91,31 @@ This does not raise any obvious exceptions, but also does not create a new colum Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`pandas.api.types.CategoricalDtype` has been added to the public API and +expanded to include the ``categories`` and ``ordered`` attributes. A +``CategoricalDtype`` can be used to specify the set of categories and +orderedness of an array, independent of the data themselves. This can be useful, +e.g., when converting string data to a ``Categorical`` (:issue:`14711`, +:issue:`15078`, :issue:`16015`): + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of ``CategoricalDtype``. + +See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more. + .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index ddca93f07ad5e..48180f5e3217a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -23,7 +23,7 @@ is_datetimelike, is_categorical, is_categorical_dtype, - is_integer_dtype, is_bool, + is_integer_dtype, is_list_like, is_sequence, is_scalar) from pandas.core.common import is_null_slice, _maybe_box_datetimelike @@ -139,33 +139,6 @@ def maybe_to_categorical(array): setter to change values in the categorical. """ -_categories_doc = """The categories of this categorical. - -Setting assigns new values to each category (effectively a rename of -each individual category). 
- -The assigned value has to be a list-like object. All items must be unique and -the number of items in the new categories must be the same as the number of -items in the old categories. - -Assigning to `categories` is a inplace operation! - -Raises ------- -ValueError - If the new categories do not validate as categories or if the number of new - categories is unequal the number of old categories - -See also --------- -rename_categories -reorder_categories -add_categories -remove_categories -remove_unused_categories -set_categories -""" - class Categorical(PandasObject): """ @@ -192,6 +165,10 @@ class Categorical(PandasObject): ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will not be ordered. + dtype : CategoricalDtype + An instance of ``CategoricalDtype`` to use for this categorical + + .. versionadded:: 0.21.0 Attributes ---------- @@ -202,6 +179,11 @@ class Categorical(PandasObject): categorical, read only. ordered : boolean Whether or not this Categorical is ordered. + dtype : CategoricalDtype + The instance of ``CategoricalDtype`` storing the ``categories`` + and ``ordered``. + + .. versionadded:: 0.21.0 Raises ------ @@ -211,7 +193,6 @@ class Categorical(PandasObject): If an explicit ``ordered=True`` is given but no `categories` and the `values` are not sortable. - Examples -------- >>> from pandas import Categorical @@ -223,17 +204,17 @@ class Categorical(PandasObject): [a, b, c, a, b, c] Categories (3, object): [a < b < c] + Only ordered `Categoricals` can be sorted (according to the order + of the categories) and have a min and max value. + >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'], ordered=True) >>> a.min() 'c' - """ - dtype = CategoricalDtype() - """The dtype (always "category")""" - """Whether or not this Categorical is ordered. 
- Only ordered `Categoricals` can be sorted (according to the order - of the categories) and have a min and max value. + Notes + ----- + See the :ref:`user guide ` for more. See also -------- @@ -241,23 +222,58 @@ class Categorical(PandasObject): Categorical.order Categorical.min Categorical.max + pandas.api.types.CategoricalDtype """ # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 + _dtype = CategoricalDtype() _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, fastpath=False): + def __init__(self, values, categories=None, ordered=None, dtype=None, + fastpath=False): + + # Ways of specifying the dtype (prioritized ordered) + # 1. dtype is a CategoricalDtype + # a.) with known categories, use dtype.categories + # b.) else with Categorical values, use values.dtype + # c.) else, infer from values + # d.) specifying dtype=CategoricalDtype and categories is an error + # 2. dtype is a string 'category' + # a.) use categories, ordered + # b.) use values.dtype + # c.) infer from values + # 3. dtype is None + # a.) use categories, ordered + # b.) use values.dtype + # c.) 
infer from values + + if dtype is not None: + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + raise ValueError("Unknown `dtype` {}".format(dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify both `dtype` and `categories`" + " or `ordered`.") + + categories = dtype.categories + ordered = dtype.ordered + + elif is_categorical(values): + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + dtype = CategoricalDtype(categories, ordered) - self._validate_ordered(ordered) + # At this point, dtype is always a CategoricalDtype + # if dtype.categories is None, we are inferring if fastpath: - # fast path self._codes = coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories( - categories, fastpath=isinstance(categories, ABCIndexClass)) - self._ordered = ordered + self._dtype = dtype return # sanitize input @@ -274,6 +290,7 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): values = values.get_values() elif isinstance(values, (ABCIndexClass, ABCSeries)): + # we'll do inference later pass else: @@ -291,12 +308,12 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # "object" dtype to prevent this. In the end objects will be # casted to int/... in the category assignment step. 
if len(values) == 0 or isna(values).any(): - dtype = 'object' + sanitize_dtype = 'object' else: - dtype = None - values = _sanitize_array(values, None, dtype=dtype) + sanitize_dtype = None + values = _sanitize_array(values, None, dtype=sanitize_dtype) - if categories is None: + if dtype.categories is None: try: codes, categories = factorize(values, sort=True) except TypeError: @@ -313,7 +330,9 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - categories = self._validate_categories(categories) + if dtype.categories is None: + # we're inferring from values + dtype = CategoricalDtype(categories, ordered) else: # there were two ways if categories are present @@ -323,14 +342,12 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # - the new one, where each value is also in the categories array # (or np.nan) - # make sure that we always have the same type here, no matter what - # we get passed in - categories = self._validate_categories(categories) - codes = _get_codes_for_values(values, categories) + codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. These warnings should be removes # after 0.18/ in 2016 - if is_integer_dtype(values) and not is_integer_dtype(categories): + if (is_integer_dtype(values) and + not is_integer_dtype(dtype.categories)): warn("Values and categories have different dtypes. 
Did you " "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) @@ -341,9 +358,57 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) - self.set_ordered(ordered or False, inplace=True) - self._categories = categories - self._codes = coerce_indexer_dtype(codes, categories) + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + """The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self.dtype.ordered + + @property + def dtype(self): + """The :ref:`~pandas.api.types.CategoricalDtype` for this instance""" + return self._dtype def __dir__(self): # Avoid IPython warnings for deprecated properties @@ -492,7 +557,7 @@ def 
from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = cls._validate_categories(categories) + categories = CategoricalDtype._validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -535,94 +600,38 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _categories = None - - @classmethod - def _validate_ordered(cls, ordered): - """ - Validates that we have a valid ordered parameter. If - it is not a boolean, a TypeError will be raised. - - Parameters - ---------- - ordered : object - The parameter to be verified. - - Raises - ------ - TypeError - If 'ordered' is not a boolean. - """ - if not is_bool(ordered): - raise TypeError("'ordered' must either be 'True' or 'False'") - - @classmethod - def _validate_categories(cls, categories, fastpath=False): - """ - Validates that we have good categories - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - """ - if not isinstance(categories, ABCIndexClass): - dtype = None - if not hasattr(categories, "dtype"): - if not is_list_like(categories): - raise TypeError("`categories` must be list-like. " - "Got {} instead".format(repr(categories))) - categories = _convert_to_list_like(categories) - # On categories with NaNs, int values would be converted to - # float. Use "object" dtype to prevent this. - if isna(categories).any(): - without_na = np.array([x for x in categories - if notna(x)]) - with_na = np.array(categories) - if with_na.dtype != without_na.dtype: - dtype = "object" - - from pandas import Index - categories = Index(categories, dtype=dtype) - - if not fastpath: - - # Categories cannot contain NaN. - if categories.hasnans: - raise ValueError('Categorial categories cannot be null') - - # Categories must be unique. 
- if not categories.is_unique: - raise ValueError('Categorical categories must be unique') - - return categories - def _set_categories(self, categories, fastpath=False): - """ Sets new categories + """ Sets new categories inplace Parameters ---------- fastpath : boolean (default: False) Don't perform validation of the categories for uniqueness or nulls + Examples + -------- + >>> c = Categorical(['a', 'b']) + >>> c + [a, b] + Categories (2, object): [a, b] + + >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c + [a, c] + Categories (2, object): [a, c] """ - categories = self._validate_categories(categories, fastpath=fastpath) - if (not fastpath and self._categories is not None and - len(categories) != len(self._categories)): + if fastpath: + new_dtype = CategoricalDtype._from_fastpath(categories, + self.ordered) + else: + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (not fastpath and self.dtype.categories is not None and + len(new_dtype.categories) != len(self.dtype.categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") - self._categories = categories - - def _get_categories(self): - """ Gets the categories """ - # categories is an Index, which is immutable -> no need to copy - return self._categories - - categories = property(fget=_get_categories, fset=_set_categories, - doc=_categories_doc) + self._dtype = new_dtype def _codes_for_groupby(self, sort): """ @@ -664,7 +673,21 @@ def _codes_for_groupby(self, sort): return self.reorder_categories(cat.categories) - _ordered = None + def _set_dtype(self, dtype): + """Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. 
+ """ + codes = _recode_for_categories(self.codes, self.categories, + dtype.categories) + return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): """ @@ -679,9 +702,9 @@ def set_ordered(self, value, inplace=False): of this categorical with ordered set to the value """ inplace = validate_bool_kwarg(inplace, 'inplace') - self._validate_ordered(value) + new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._ordered = value + cat._dtype = new_dtype if not inplace: return cat @@ -711,12 +734,6 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) - def _get_ordered(self): - """ Gets the ordered attribute """ - return self._ordered - - ordered = property(fget=_get_ordered) - def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Sets the categories to the specified new_categories. @@ -769,22 +786,21 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_unused_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_categories = self._validate_categories(new_categories) + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + cat = self if inplace else self.copy() if rename: - if (cat._categories is not None and - len(new_categories) < len(cat._categories)): + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_categories)] = -1 + self._codes[self._codes >= len(new_dtype.categories)] = -1 else: codes = _recode_for_categories(self.codes, self.categories, - new_categories) + new_dtype.categories) cat._codes = codes - cat._categories = new_categories - - if ordered is None: - ordered = self.ordered - cat.set_ordered(ordered, 
inplace=True) + cat._dtype = new_dtype if not inplace: return cat @@ -864,7 +880,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self._categories) != set(new_categories): + if set(self.dtype.categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, @@ -905,15 +921,17 @@ def add_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_list_like(new_categories): new_categories = [new_categories] - already_included = set(new_categories) & set(self._categories) + already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: msg = ("new categories must not include old categories: %s" % str(already_included)) raise ValueError(msg) - new_categories = list(self._categories) + list(new_categories) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self if inplace else self.copy() - cat._categories = self._validate_categories(new_categories) - cat._codes = coerce_indexer_dtype(cat._codes, new_categories) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: return cat @@ -953,8 +971,9 @@ def remove_categories(self, removals, inplace=False): removals = [removals] removal_set = set(list(removals)) - not_included = removal_set - set(self._categories) - new_categories = [c for c in self._categories if c not in removal_set] + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -996,8 +1015,11 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv 
= idx[1:], inv - 1 - cat._categories = cat.categories.take(idx) - cat._codes = coerce_indexer_dtype(inv, self._categories) + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype._from_fastpath(new_categories, + ordered=self.ordered) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) if not inplace: return cat @@ -1098,7 +1120,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_categories' not in state and '_levels' in state: - state['_categories'] = self._validate_categories(state.pop( + state['_categories'] = self.dtype._validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( @@ -1113,6 +1135,11 @@ def __setstate__(self, state): else: state['_ordered'] = False + # 0.21.0 CategoricalDtype change + if '_dtype' not in state: + state['_dtype'] = CategoricalDtype(state['_categories'], + state['_ordered']) + for k, v in compat.iteritems(state): setattr(self, k, v) @@ -1122,7 +1149,7 @@ def T(self): @property def nbytes(self): - return self._codes.nbytes + self._categories.values.nbytes + return self._codes.nbytes + self.dtype.categories.values.nbytes def memory_usage(self, deep=False): """ @@ -1147,7 +1174,8 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self._categories.memory_usage(deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) @@ -1278,7 +1306,7 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, categories=cat, ordered=obj.ordered, + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) return Series(count, index=CategoricalIndex(ix), dtype='int64') @@ -1991,8 +2019,7 @@ def is_dtype_equal(self, other): """ try: - return 
(self.categories.equals(other.categories) and - self.ordered == other.ordered) + return hash(self.dtype) == hash(other.dtype) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c47e61dc446be..f60c0d5ffdca0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -692,6 +692,40 @@ def is_dtype_equal(source, target): return False +def is_dtype_union_equal(source, target): + """ + Check whether two arrays have compatible dtypes to do a union. + numpy types are checked with ``is_dtype_equal``. Extension types are + checked separately. + + Parameters + ---------- + source : The first dtype to compare + target : The second dtype to compare + + Returns + ---------- + boolean : Whether or not the two dtypes are equal. + + >>> is_dtype_equal("int", int) + True + + >>> is_dtype_equal(CategoricalDtype(['a', 'b'], + ... CategoricalDtype(['b', 'c'])) + True + + >>> is_dtype_equal(CategoricalDtype(['a', 'b'], + ... CategoricalDtype(['b', 'c'], ordered=True)) + False + """ + source = _get_dtype(source) + target = _get_dtype(target) + if is_categorical_dtype(source) and is_categorical_dtype(target): + # ordered False for both + return source.ordered is target.ordered + return is_dtype_equal(source, target) + + def is_any_int_dtype(arr_or_dtype): """ DEPRECATED: This function will be removed in a future version. 
@@ -1671,7 +1705,9 @@ def _coerce_to_dtype(dtype): """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc2c56ea476f9..d2487905caced 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,6 +3,7 @@ import re import numpy as np from pandas import compat +from pandas.core.dtypes.generic import ABCIndexClass class ExtensionDtype(object): @@ -110,37 +111,161 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ - A np.dtype duck-typed class, suitable for holding a custom categorical - dtype. - - THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + Type for categorical data with the categories and orderedness + + .. versionchanged:: 0.21.0 + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + ordered : bool, default False + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. See :ref:`categorical.categoricaldtype` + for more. + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): [b < a] + + See Also + -------- + Categorical """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = [] + _metadata = ['categories', 'ordered'] _cache = {} - def __new__(cls): + def __init__(self, categories=None, ordered=False): + self._finalize(categories, ordered, fastpath=False) - try: - return cls._cache[cls.name] - except KeyError: - c = object.__new__(cls) - cls._cache[cls.name] = c - return c + @classmethod + def _from_fastpath(cls, categories=None, ordered=False): + self = cls.__new__(cls) + self._finalize(categories, ordered, fastpath=True) + return self + + @classmethod + def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): + if categories is ordered is None: + return dtype + if categories is None: + categories = dtype.categories + if ordered is None: + ordered = dtype.ordered + return cls(categories, ordered) + + def _finalize(self, categories, ordered, fastpath=False): + from pandas.core.indexes.base import Index + + if ordered is None: + ordered = False + + if categories is not None: + categories = Index(categories, tupleize_cols=False) + # validation + self._validate_categories(categories) + self._validate_ordered(ordered) + self._categories = categories + self._ordered = ordered + + def __setstate__(self, state): + self._categories = state.pop('categories', None) + self._ordered = state.pop('ordered', False) def __hash__(self): - # make myself hashable - return hash(str(self)) + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, CategoricalDtype) + if not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + return 
False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) = CDT(None, True). + return True + elif self.ordered: + return other.ordered and self.categories.equals(other.categories) + elif other.ordered: + return False + else: + # both unordered; this could probably be optimized / cached + return hash(self) == hash(other) + + def __unicode__(self): + tpl = u'CategoricalDtype(categories={}ordered={})' + if self.categories is None: + data = u"None, " + else: + data = self.categories._format_data(name=self.__class__.__name__) + return tpl.format(data, self.ordered) + + @staticmethod + def _hash_categories(categories, ordered=True): + from pandas.core.util.hashing import ( + hash_array, _combine_hash_arrays, hash_tuples + ) + + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all are. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == 'O': + types = [type(x) for x in categories] + if not len(set(types)) == 1: + # TODO: hash_array doesn't handle mixed types. 
It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + cat_array = np.array([hash(x) for x in categories]) + hashed = hash((tuple(categories), ordered)) + return hashed + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack([ + cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) + ]) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), + num_items=len(cat_array)) + if len(hashed) == 0: + # bug in Numpy<1.12 for length 0 arrays. Just return the correct + # value of 0 + return 0 + else: + return np.bitwise_xor.reduce(hashed) @classmethod def construct_from_string(cls, string): @@ -154,6 +279,68 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") + @staticmethod + def _validate_ordered(ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. 
+ """ + from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def _validate_categories(categories, fastpath=False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas import Index + + if not isinstance(categories, ABCIndexClass): + categories = Index(categories) + + if not fastpath: + + if categories.hasnans: + raise ValueError('Categorial categories cannot be null') + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + + return categories + + @property + def categories(self): + """ + An ``Index`` containing the unique categories allowed. + """ + return self._categories + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self._ordered + class DatetimeTZDtypeType(type): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 008828cf4f309..3ce7403f8d726 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,6 +27,7 @@ is_integer, is_float, is_dtype_equal, + is_dtype_union_equal, is_object_dtype, is_categorical_dtype, is_interval_dtype, @@ -847,7 +848,7 @@ def _formatter_func(self): """ return default_pprint - def _format_data(self): + def _format_data(self, name=None): """ Return the formatted data as a unicode string """ @@ -856,9 +857,11 @@ def _format_data(self): display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 + if name is None: + name = self.__class__.__name__ - space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + space1 = "\n%s" % (' ' * (len(name) + 1)) + space2 = "\n%s" % (' ' * (len(name) + 2)) n = 
len(self) sep = ',' @@ -2170,7 +2173,11 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not is_dtype_equal(self.dtype, other.dtype): + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ef1dc4d971f37..5464bc10b18e5 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -58,16 +58,18 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): if fastpath: - return cls._simple_new(data, name=name) + return cls._simple_new(data, name=name, dtype=dtype) if name is None and hasattr(data, 'name'): name = data.name if isinstance(data, ABCCategorical): - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) elif isinstance(data, CategoricalIndex): data = data._data - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) else: # don't allow scalars @@ -114,7 +116,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None, return CategoricalIndex(cat, name=name) @staticmethod - def _create_categorical(self, data, categories=None, ordered=None): + def _create_categorical(self, data, categories=None, ordered=None, + dtype=None): """ *this is an internal non-public method* @@ -125,6 +128,7 @@ def _create_categorical(self, data, categories=None, ordered=None): data : data for new Categorical categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing + 
dtype : CategoricalDtype, defaults to existing Returns ------- @@ -135,22 +139,30 @@ def _create_categorical(self, data, categories=None, ordered=None): data = data.values if not isinstance(data, ABCCategorical): - ordered = False if ordered is None else ordered + if ordered is None and dtype is None: + ordered = False from pandas.core.categorical import Categorical - data = Categorical(data, categories=categories, ordered=ordered) + data = Categorical(data, categories=categories, ordered=ordered, + dtype=dtype) else: + from pandas.core.dtypes.dtypes import CategoricalDtype + if categories is not None: - data = data.set_categories(categories) - if ordered is not None: + data = data.set_categories(categories, ordered=ordered) + elif ordered is not None and ordered != data.ordered: data = data.set_ordered(ordered) + if isinstance(dtype, CategoricalDtype): + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) return data @classmethod def _simple_new(cls, values, name=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): result = object.__new__(cls) - values = cls._create_categorical(cls, values, categories, ordered) + values = cls._create_categorical(cls, values, categories, ordered, + dtype=dtype) result._data = values result.name = name for k, v in compat.iteritems(kwargs): @@ -161,16 +173,28 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): # categories and ordered can't be part of attributes, # as these are properties + # we want to reuse self.dtype if possible, i.e. neither are + # overridden. 
+ if dtype is not None and (categories is not None or + ordered is not None): + raise TypeError("Cannot specify both `dtype` and `categories` " + "or `ordered`") + + if categories is None and ordered is None: + dtype = self.dtype if dtype is None else dtype + return super(CategoricalIndex, self)._shallow_copy( + values=values, dtype=dtype, **kwargs) if categories is None: categories = self.categories if ordered is None: ordered = self.ordered - return super(CategoricalIndex, - self)._shallow_copy(values=values, categories=categories, - ordered=ordered, **kwargs) + + return super(CategoricalIndex, self)._shallow_copy( + values=values, categories=categories, + ordered=ordered, **kwargs) def _is_dtype_compat(self, other): """ @@ -236,7 +260,7 @@ def _format_attrs(self): ('ordered', self.ordered)] if self.name is not None: attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'%s'" % self.dtype.name)) max_seq_items = get_option('display.max_seq_items') or len(self) if len(self) > max_seq_items: attrs.append(('length', len(self))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c0a9c139722f5..c36ef020faf31 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -950,9 +950,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): na_rep=na_rep, justify='all').get_result() - def _format_data(self): + def _format_data(self, name=None): # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option( 'display.max_seq_items') or n) // 10, 10) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea613a27b6521..9de69c9c3e97c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def _format_attrs(self): def _format_space(self): return "\n%s" % (' ' 
* (len(self.__class__.__name__) + 1)) - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b759abaed4e56..81600f1baa842 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -189,7 +189,7 @@ def _format_attrs(self): attrs.append(('name', ibase.default_pprint(self.name))) return attrs - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83b382ec0ed72..e510ca87e44aa 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if is_categorical_dtype(dtype): - if dtype == CategoricalDtype(): - return True - + if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing raise TypeError("invalid type {0} for astype".format(dtype)) + elif is_categorical_dtype(dtype): + return True + return False def external_values(self, dtype=None): @@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + if (('categories' in kwargs or 'ordered' in kwargs) and + isinstance(dtype, CategoricalDtype)): + raise TypeError("Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. 
Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + + kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index ac11c5f908fdc..bc84bd09f0b44 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2987,7 +2987,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 12e8d8aba9177..27252b9616a44 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,7 +2,6 @@ import numpy as np from pandas.compat import long, string_types, PY3 -from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, @@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True): def lexsort_indexer(keys, orders=None, na_position='last'): + from pandas.core.categorical import Categorical + labels = [] shape = [] if isinstance(orders, bool): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 07e993d7ef509..0c82773b75c28 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. 
+ # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4..e0be34b14a97d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -545,10 +545,12 @@ def test_is_complex_dtype(): (pd.Index([1, 2]), np.dtype('int64')), (pd.Index(['a', 'b']), np.dtype(object)), ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()), - (pd.Categorical(['a', 'b']), CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()), + (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), + (CategoricalDtype(), CategoricalDtype()), + (CategoricalDtype(['a', 'b']), CategoricalDtype()), (pd.DatetimeIndex([1, 2]), np.dtype('