From 9b2d05f3ae1e529e9fe6c90d5021c7cdaf4e34fe Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.augspurger88@gmail.com>
Date: Sat, 19 Nov 2016 20:24:30 -0600
Subject: [PATCH 1/3] API: CategoricalType for specifying categoricals

Simple implementation for now.
---
 pandas/core/api.py                 |  2 +-
 pandas/core/categorical.py         | 45 ++++++++++++++++++++++++++++++
 pandas/core/internals.py           |  6 ++++
 pandas/tests/series/test_dtypes.py |  8 +++++-
 4 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/pandas/core/api.py b/pandas/core/api.py
index b5e1de2063c7e..56f648c46fbdc 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -6,7 +6,7 @@
 
 from pandas.core.algorithms import factorize, match, unique, value_counts
 from pandas.types.missing import isnull, notnull
-from pandas.core.categorical import Categorical
+from pandas.core.categorical import Categorical, CategoricalType
 from pandas.core.groupby import Grouper
 from pandas.formats.format import set_eng_float_format
 from pandas.core.index import (Index, CategoricalIndex, Int64Index,
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index fd1a23a5bab7f..516cb16192922 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -2066,3 +2066,48 @@ def _factorize_from_iterables(iterables):
         # For consistency, it should return a list of 2 lists.
         return [[], []]
     return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
+
+
+class CategoricalType(CategoricalDtype):
+    """
+    Type for categorical data with the categories and orderedness,
+    but not the values
+
+    Parameters
+    ----------
+    categories : list or None
+    ordered : bool, default False
+
+    Notes
+    -----
+    `categories=None` implies infer in whatever operation you're
+    doing.
+
+    Examples
+    --------
+    >>> t = CategoricalType(categories=['b', 'a'], ordered=True)
+    >>> s = Series(['a', 'a', 'b', 'b', 'a'])
+    >>> s.astype(t)
+    0    a
+    1    a
+    2    b
+    3    b
+    4    a
+    dtype: category
+    Categories (2, object): [b < a]
+    """
+    dtype = 'category'
+    name = 'category'
+
+    def __new__(cls, categories=None, ordered=False):
+        self = object.__new__(cls)
+        self.categories = categories
+        self.ordered = ordered
+        # XXX: this is just for the repr, will move to base type
+        self._categorical = Categorical(None, categories=categories,
+                                        ordered=ordered)
+        return self
+
+    def __repr__(self):
+        return "<CategoricalType {}>".format(
+            self._categorical._repr_categories())
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 43beefffd448e..6083c15e51e95 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -470,6 +470,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
         # may need to convert to categorical
         # this is only called for non-categoricals
         if self.is_categorical_astype(dtype):
+            kwargs = kwargs.copy()
+            categories = getattr(dtype, 'categories', None)
+            ordered = getattr(dtype, 'ordered', False)
+            # should we raise if CategoricalType and passed in kwargs?
+            kwargs.setdefault('categories', categories)
+            kwargs.setdefault('ordered', ordered)
             return self.make_block(Categorical(self.values, **kwargs))
 
         # astype processing
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index 9a406dfa10c35..b3cea6f6c956e 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -8,7 +8,7 @@
 from numpy import nan
 import numpy as np
 
-from pandas import Series
+from pandas import Series, CategoricalType, Categorical
 from pandas.tseries.index import Timestamp
 from pandas.tseries.tdi import Timedelta
 
@@ -149,6 +149,12 @@ def test_astype_dict(self):
         self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str})
         self.assertRaises(KeyError, s.astype, {0: str})
 
+    def test_astype_categorical(self):
+        s = Series(['a', 'b', 'a'])
+        result = s.astype(CategoricalType(['a', 'b'], ordered=True))
+        expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
+        assert_series_equal(result, expected)
+
     def test_complexx(self):
         # GH4819
         # complex access for ndarray compat

From 9777fcf03024646371a9b8cebfc5429e479d9e5f Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.augspurger88@gmail.com>
Date: Sun, 20 Nov 2016 06:52:36 -0600
Subject: [PATCH 2/3] reuse CategoricalDtype

---
 doc/source/categorical.rst         | 20 +++++++++-
 pandas/core/api.py                 |  3 +-
 pandas/core/categorical.py         | 45 ---------------------
 pandas/tests/series/test_dtypes.py | 15 ++++++-
 pandas/types/dtypes.py             | 64 ++++++++++++++++++++++++++++--
 5 files changed, 94 insertions(+), 53 deletions(-)

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 090998570a358..572c2b1dcd8fd 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -96,12 +96,14 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
     df["B"] = raw_cat
     df
 
-You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``:
+You can also specify differently ordered categories or make the resulting data
+ordered by passing a :class:`CategoricalDtype`:
 
 .. ipython:: python
 
     s = pd.Series(["a","b","c","a"])
-    s_cat = s.astype("category", categories=["b","c","d"], ordered=False)
+    cat_type = pd.CategoricalDtype(categories=["b", "c", "d"], ordered=False)
+    s_cat = s.astype(cat_type)
     s_cat
 
 Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
@@ -141,6 +143,20 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
+
+CategoricalDtype
+----------------
+
+A categorical's type is fully described by (1) its categories (an iterable)
+and (2) its orderedness (a boolean).
+This information can be stored in a :class:`~pandas.CategoricalDtype` and passed
+anywhere pandas expects a ``dtype``, for example :func:`pandas.read_csv`,
+:func:`pandas.DataFrame.astype`, or the Series constructor.
+
+As a convenience, you can use the string ``'category'`` in place of a
+:class:`pandas.CategoricalDtype` when you want the default behavior of
+the categories being unordered and equal to the set of values present in the array.
+
 Description
 -----------
 
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 56f648c46fbdc..d12287f6ab5a3 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -5,8 +5,9 @@
 import numpy as np
 
 from pandas.core.algorithms import factorize, match, unique, value_counts
+from pandas.types.dtypes import CategoricalDtype
 from pandas.types.missing import isnull, notnull
-from pandas.core.categorical import Categorical, CategoricalType
+from pandas.core.categorical import Categorical
 from pandas.core.groupby import Grouper
 from pandas.formats.format import set_eng_float_format
 from pandas.core.index import (Index, CategoricalIndex, Int64Index,
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 516cb16192922..fd1a23a5bab7f 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -2066,48 +2066,3 @@ def _factorize_from_iterables(iterables):
         # For consistency, it should return a list of 2 lists.
         return [[], []]
     return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
-
-
-class CategoricalType(CategoricalDtype):
-    """
-    Type for categorical data with the categories and orderedness,
-    but not the values
-
-    Parameters
-    ----------
-    categories : list or None
-    ordered : bool, default False
-
-    Notes
-    -----
-    `categories=None` implies infer in whatever operation you're
-    doing.
-
-    Examples
-    --------
-    >>> t = CategoricalType(categories=['b', 'a'], ordered=True)
-    >>> s = Series(['a', 'a', 'b', 'b', 'a'])
-    >>> s.astype(t)
-    0    a
-    1    a
-    2    b
-    3    b
-    4    a
-    dtype: category
-    Categories (2, object): [b < a]
-    """
-    dtype = 'category'
-    name = 'category'
-
-    def __new__(cls, categories=None, ordered=False):
-        self = object.__new__(cls)
-        self.categories = categories
-        self.ordered = ordered
-        # XXX: this is just for the repr, will move to base type
-        self._categorical = Categorical(None, categories=categories,
-                                        ordered=ordered)
-        return self
-
-    def __repr__(self):
-        return "<CategoricalType {}>".format(
-            self._categorical._repr_categories())
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index b3cea6f6c956e..38ec58c974f97 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -8,7 +8,7 @@
 from numpy import nan
 import numpy as np
 
-from pandas import Series, CategoricalType, Categorical
+from pandas import Series, CategoricalDtype, Categorical, Index
 from pandas.tseries.index import Timestamp
 from pandas.tseries.tdi import Timedelta
 
@@ -151,10 +151,21 @@ def test_astype_dict(self):
 
     def test_astype_categorical(self):
         s = Series(['a', 'b', 'a'])
-        result = s.astype(CategoricalType(['a', 'b'], ordered=True))
+        result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
         expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
         assert_series_equal(result, expected)
 
+        result = s.astype(CategoricalDtype(['a', 'b'], ordered=False))
+        expected = Series(Categorical(['a', 'b', 'a'], ordered=False))
+        assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False))
+        expected = Series(Categorical(['a', 'b', 'a'],
+                                      categories=['a', 'b', 'c'],
+                                      ordered=False))
+        assert_series_equal(result, expected)
+        tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c']))
+
     def test_complexx(self):
         # GH4819
         # complex access for ndarray compat
diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py
index 5b6d7905d4095..58d0410f45090 100644
--- a/pandas/types/dtypes.py
+++ b/pandas/types/dtypes.py
@@ -1,6 +1,7 @@
 """ define extension dtypes """
 
 import re
+import reprlib
 import numpy as np
 from pandas import compat
 
@@ -98,11 +99,54 @@ class CategoricalDtypeType(type):
 class CategoricalDtype(ExtensionDtype):
 
     """
+    Type for categorical data with the categories and orderedness,
+    but not the values
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    categories : list or None
+    ordered : bool, default False
+
+    Examples
+    --------
+    >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True)
+    >>> s = Series(['a', 'a', 'b', 'b', 'a'])
+    >>> s.astype(t)
+    0    a
+    1    a
+    2    b
+    3    b
+    4    a
+    dtype: category
+    Categories (2, object): [b < a]
+
+    Notes
+    -----
+    An instance of ``CategoricalDtype`` compares equal with any other
+    instance of ``CategoricalDtype``, regardless of categories or ordered.
+    In addition they compare equal to the string ``'category'``.
+    To check whether two instances of a ``CategoricalDtype`` match,
+    use the ``is`` operator.
+
+    >>> t1 = CategoricalDtype(['a', 'b'], ordered=True)
+    >>> t2 = CategoricalDtype(['a', 'c'], ordered=False)
+    >>> t1 == t2
+    True
+    >>> t1 == 'category'
+    True
+    >>> t1 is t2
+    False
+    >>> t1 is CategoricalDtype(['a', 'b'], ordered=True)
+    True
+
     A np.dtype duck-typed class, suitable for holding a custom categorical
     dtype.
 
     THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object
     """
+    # TODO: Document public vs. private API
     name = 'category'
     type = CategoricalDtypeType
     kind = 'O'
@@ -110,13 +154,18 @@ class CategoricalDtype(ExtensionDtype):
     base = np.dtype('O')
     _cache = {}
 
-    def __new__(cls):
+    def __new__(cls, categories=None, ordered=False):
+        categories_ = categories if categories is None else tuple(categories)
+        t = (categories_, ordered)
 
         try:
-            return cls._cache[cls.name]
+            return cls._cache[t]
         except KeyError:
             c = object.__new__(cls)
-            cls._cache[cls.name] = c
+            c.categories = categories
+            c.ordered = ordered
+
+            cls._cache[t] = c
             return c
 
     def __hash__(self):
@@ -129,6 +178,15 @@ def __eq__(self, other):
 
         return isinstance(other, CategoricalDtype)
 
+    # def __unicode__(self):
+    #     tpl = 'CategoricalDtype({!r}, ordered={})'
+    #     return tpl.format(reprlib.repr(self.categories), self.ordered)
+
+    # def __repr__(self):
+    #     """ return the base repr for the categories """
+    #     tpl = 'CategoricalDtype({!r}, ordered={})'
+    #     return tpl.format(reprlib.repr(self.categories), self.ordered)
+
     @classmethod
     def construct_from_string(cls, string):
         """ attempt to construct this type from a string, raise a TypeError if

From 2a6a0e140d01366fed30fa7562656e0a74a7f983 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.augspurger88@gmail.com>
Date: Sun, 20 Nov 2016 19:58:13 -0600
Subject: [PATCH 3/3] Series ctor

---
 pandas/core/series.py                    |  3 ++-
 pandas/tests/series/test_constructors.py | 18 ++++++++++++++++++
 pandas/types/common.py                   |  5 ++++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 105e39562f561..bba7cd4e326a2 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2837,7 +2837,8 @@ def _try_cast(arr, take_fast_path):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
-                subarr = Categorical(arr)
+                subarr = Categorical(arr, dtype.categories,
+                                     ordered=dtype.ordered)
             elif dtype is not None and raise_cast_failure:
                 raise
             else:
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index ed7b0fda19cb7..18c8d8eaaa450 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -162,6 +162,24 @@ def test_constructor_categorical(self):
         self.assertTrue(is_categorical_dtype(s))
         self.assertTrue(is_categorical_dtype(s.dtype))
 
+    def test_constructor_categorical_dtype(self):
+        result = pd.Series(['a', 'b'],
+                           dtype=pd.CategoricalDtype(['a', 'b', 'c'],
+                                                     ordered=True))
+        self.assertTrue(is_categorical_dtype(result))
+        tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
+        self.assertTrue(result.cat.ordered)
+
+        result = pd.Series(['a', 'b'], dtype=pd.CategoricalDtype(['b', 'a']))
+        self.assertTrue(is_categorical_dtype(result))
+        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
+        self.assertFalse(result.cat.ordered)
+
+        result = pd.Series(['a', 'b', 'c'],
+                           dtype=pd.CategoricalDtype(['a', 'b']))
+        expected = pd.Series(pd.Categorical(['a', 'b', np.nan]))
+        tm.assert_series_equal(result, expected)
+
     def test_constructor_maskedarray(self):
         data = ma.masked_all((3, ), dtype=float)
         result = Series(data)
diff --git a/pandas/types/common.py b/pandas/types/common.py
index 754ff80924c07..50b2bc253e11a 100644
--- a/pandas/types/common.py
+++ b/pandas/types/common.py
@@ -319,7 +319,10 @@ def is_complex_dtype(arr_or_dtype):
 def _coerce_to_dtype(dtype):
     """ coerce a string / np.dtype to a dtype """
     if is_categorical_dtype(dtype):
-        dtype = CategoricalDtype()
+        categories = getattr(dtype, 'categories', None)
+        ordered = getattr(dtype, 'ordered', False)
+        # TODO: pass thru categories, ordered
+        dtype = CategoricalDtype(categories=categories, ordered=ordered)
     elif is_datetime64tz_dtype(dtype):
         dtype = DatetimeTZDtype(dtype)
     elif is_period_dtype(dtype):