ENH: Added parametrized CategoricalDtype

TomAugspurger · TomAugspurger · commit 2711623ca832 · 2017-04-15T21:50:36.000-05:00
We extended the CategoricalDtype to accept optional `categories` and `ordered`
arguments. CategoricalDtype is now part of the public API. This allows users to
specify the desired categories and orderedness of an operation ahead of time.
The current behavior, which is still possible with `categories=None`, the
default, is to infer the categories from whatever is present.

This change will make it easy to implement support for specifying categories
that are known ahead of time in other places e.g. `.astype`, `.read_csv`, and the
`Series` constructor.
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -96,12 +96,14 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
     df["B"] = raw_cat
     df
 
-You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``:
+You can also specify differently ordered categories or make the resulting data
+ordered by passing a :class:`CategoricalDtype`:
 
 .. ipython:: python
 
     s = pd.Series(["a","b","c","a"])
-    s_cat = s.astype("category", categories=["b","c","d"], ordered=False)
+    cat_type = pd.CategoricalDtype(categories=["b", "c", "d"], ordered=False)
+    s_cat = s.astype(cat_type)
     s_cat
 
 Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
@@ -140,6 +142,24 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
+
+CategoricalDtype
+----------------
+
+A categorical's type is fully described by 1.) its categories (an iterable with
+unique values and no missing values), and 2.) its orderedness (a boolean).
+This information can be stored in a :class:`~pandas.CategoricalDtype`.
+The ``categories`` argument is optional, which implies that the actual categories
+should be inferred from whatever is present in the data.
+
+A :class:`~pandas.CategoricalDtype` can be used in any place pandas expects a
+`dtype`. For example :func:`pandas.read_csv`, :func:`pandas.DataFrame.astype`,
+the Series constructor, etc.
+
+As a convenience, you can use the string ``'category'`` in place of a
+:class:`pandas.CategoricalDtype` when you want the default behavior of
+the categories being unordered, and equal to the set of values present in the array.
+
 Description
 -----------
 
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -6,6 +6,7 @@
 
 from pandas.core.algorithms import factorize, unique, value_counts
 from pandas.core.dtypes.missing import isnull, notnull
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.categorical import Categorical
 from pandas.core.groupby import Grouper
 from pandas.formats.format import set_eng_float_format
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -501,7 +501,9 @@ def _coerce_to_dtype(dtype):
     """
 
     if is_categorical_dtype(dtype):
-        dtype = CategoricalDtype()
+        categories = getattr(dtype, 'categories', None)
+        ordered = getattr(dtype, 'ordered', False)
+        dtype = CategoricalDtype(categories=categories, ordered=ordered)
     elif is_datetime64tz_dtype(dtype):
         dtype = DatetimeTZDtype(dtype)
     elif is_period_dtype(dtype):
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -100,26 +100,66 @@ class CategoricalDtypeType(type):
 class CategoricalDtype(ExtensionDtype):
 
     """
-    A np.dtype duck-typed class, suitable for holding a custom categorical
-    dtype.
-
-    THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object
+    Type for categorical data with the categories and orderedness,
+    but not the values.
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    categories : list or None
+    ordered : bool, default False
+
+    Notes
+    -----
+    An instance of ``CategoricalDtype`` compares equal with any other
+    instance of ``CategoricalDtype``, regardless of categories or ordered.
+    In addition they compare equal to the string ``'category'``.
+
+    To check whether two instances of ``CategoricalDtype`` match exactly,
+    use the ``is`` operator.
+
+    >>> t1 = CategoricalDtype(['a', 'b'], ordered=True)
+    >>> t2 = CategoricalDtype(['a', 'c'], ordered=False)
+    >>> t1 == t2
+    True
+    >>> t1 == 'category'
+    True
+    >>> t1 is t2
+    False
+    >>> t1 is CategoricalDtype(['a', 'b'], ordered=True)
+    True
+
+    Examples
+    --------
+    >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True)
+    >>> s = Series(['a', 'a', 'b', 'b', 'a'])
+    >>> s.astype(t)
+    0    a
+    1    a
+    2    b
+    3    b
+    4    a
+    dtype: category
+    Categories (2, object): [b < a]
     """
+    # TODO: Document public vs. private API
     name = 'category'
     type = CategoricalDtypeType
     kind = 'O'
     str = '|O08'
     base = np.dtype('O')
     _cache = {}
 
-    def __new__(cls):
-
-        try:
-            return cls._cache[cls.name]
-        except KeyError:
-            c = object.__new__(cls)
-            cls._cache[cls.name] = c
-            return c
+    def __new__(cls, categories=None, ordered=False):
+        from pandas.indexes.base import Index
+        if categories is not None:
+            categories = Index(categories)
+            cls._validate_categories(categories)
+            hashed = cls._hash_categories(categories)
+        else:
+            hashed = None
+        return cls._get_or_create(categories, ordered, hashed)
 
     def __hash__(self):
         # make myself hashable
@@ -131,6 +171,33 @@ def __eq__(self, other):
 
         return isinstance(other, CategoricalDtype)
 
+    @staticmethod
+    def _hash_categories(categories):
+        from pandas.tools.hashing import hash_array, _combine_hash_arrays
+        cat_array = np.asarray(categories)
+        hashed = _combine_hash_arrays(
+            iter([hash_array(cat_array)]),
+            num_items=1
+        )
+        hashed = np.bitwise_xor.reduce(hashed)
+        return hashed
+
+    @classmethod
+    def _get_or_create(cls, categories, ordered, hashed):
+
+        try:
+            return cls._cache[(hashed, ordered)]
+        except KeyError:
+            categorical = object.__new__(cls)
+            categorical.categories = categories
+            categorical.ordered = ordered
+            cls._cache[(hashed, ordered)] = categorical
+            return categorical
+
+    def __unicode__(self):
+        tpl = 'CategoricalDtype({!r}, ordered={})'
+        return tpl.format(self.categories, self.ordered)
+
     @classmethod
     def construct_from_string(cls, string):
         """ attempt to construct this type from a string, raise a TypeError if
@@ -143,6 +210,17 @@ def construct_from_string(cls, string):
 
         raise TypeError("cannot construct a CategoricalDtype")
 
+    @staticmethod
+    def _validate_categories(categories):
+        from pandas import isnull
+
+        if not len(categories) == len(set(categories)):
+            raise ValueError("`categories` must be unique.")
+        if isnull(categories).any():
+            raise ValueError("`categories` can not contain any nulls")
+
+        return True
+
 
 class DatetimeTZDtypeType(type):
     """
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
@@ -229,7 +229,7 @@ def _format_attrs(self):
             ('ordered', self.ordered)]
         if self.name is not None:
             attrs.append(('name', ibase.default_pprint(self.name)))
-        attrs.append(('dtype', "'%s'" % self.dtype))
+        attrs.append(('dtype', "'%s'" % self.dtype.name))
         max_seq_items = get_option('display.max_seq_items') or len(self)
         if len(self) > max_seq_items:
             attrs.append(('length', len(self)))
diff --git a/pandas/tests/core/dtypes/test_dtypes.py b/pandas/tests/core/dtypes/test_dtypes.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from itertools import product
 
+import pytest
 import numpy as np
 import pandas as pd
 from pandas import Series, Categorical, IntervalIndex, date_range
@@ -356,6 +357,70 @@ def test_not_string(self):
         self.assertFalse(is_string_dtype(PeriodDtype('D')))
 
 
+class TestCategoricalDtypeParametrized(object):
+
+    @pytest.mark.parametrize('categories, ordered', [
+        (['a', 'b', 'c', 'd'], False),
+        (['a', 'b', 'c', 'd'], True),
+        (np.arange(1000), False),
+        (np.arange(1000), True),
+        (['a', 'b', 10, 2, 1.3, True], False),
+        ([True, False], True),
+        ([True, False], False),
+        (pd.date_range('2017', periods=4), True),
+        (pd.date_range('2017', periods=4), False),
+    ])
+    def test_basic(self, categories, ordered):
+        c1 = CategoricalDtype(categories, ordered=ordered)
+        tm.assert_index_equal(c1.categories, pd.Index(categories))
+        assert c1.ordered is ordered
+
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_is_singleton(self, ordered):
+        c1 = CategoricalDtype(['a', 'b', 'c'], ordered=ordered)
+        c2 = CategoricalDtype(['a', 'b', 'c'], ordered=ordered)
+        assert c1 is c2
+
+    def test_order_matters(self):
+        categories = ['a', 'b']
+        c1 = CategoricalDtype(categories, ordered=False)
+        c2 = CategoricalDtype(categories, ordered=True)
+        assert c1 is not c2
+
+    def test_unordered_same(self):
+        c1 = CategoricalDtype(['a', 'b'])
+        c2 = CategoricalDtype(['b', 'a'])
+        assert c1 is c2
+        tm.assert_index_equal(c1.categories, c2.categories)
+
+    def test_categories(self):
+        result = CategoricalDtype(['a', 'b', 'c'])
+        tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
+        assert result.ordered is False
+
+    def test_equal_but_different(self):
+        c1 = CategoricalDtype([1, 2, 3])
+        c2 = CategoricalDtype([1., 2., 3.])
+        assert c1 is not c2
+
+    @pytest.mark.parametrize('v1, v2', [
+        ([1, 2, 3], [1, 2, 3]),
+        ([1, 2, 3], [3, 2, 1]),
+    ])
+    def test_order_hashes_different(self, v1, v2):
+        c1 = CategoricalDtype(v1)
+        c2 = CategoricalDtype(v2, ordered=True)
+        assert c1 is not c2
+
+    def test_nan_invalid(self):
+        with pytest.raises(ValueError):
+            pd.CategoricalDtype([1, 2, np.nan])
+
+    def test_non_unique_invalid(self):
+        with pytest.raises(ValueError):
+            pd.CategoricalDtype([1, 2, 1])
+
+
 class TestIntervalDtype(Base, tm.TestCase):
 
     # TODO: placeholder