From 790cd42ab29e683120fc4a2e26f440fa4c6e99d7 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 24 Aug 2017 16:04:02 -0500
Subject: [PATCH 1/7] ENH: Parametrized CategoricalDtype

We extended the CategoricalDtype to accept optional categories and ordered
argument.

```python
pd.CategoricalDtype(categories=['a', 'b'], ordered=True
```

CategoricalDtype is now part of the public API. This allows users to
specify the desired categories and orderedness of an operation ahead of time.
The current behavior, which is still possible with categories=None, the
default, is to infer the categories from whatever is present.

This change will make it easy to implement support for specifying categories
that are know ahead of time in other places e.g. .astype, .read_csv, and the
Series constructor.

Closes #14711
Closes #15078
Closes #14676
---
 doc/source/advanced.rst                       |   4 +-
 doc/source/api.rst                            |   5 +-
 doc/source/categorical.rst                    | 101 +++++-
 doc/source/merging.rst                        |  11 +-
 doc/source/whatsnew/v0.21.0.txt               |  26 ++
 pandas/core/categorical.py                    | 317 +++++++++---------
 pandas/core/dtypes/common.py                  |  38 ++-
 pandas/core/dtypes/dtypes.py                  | 204 ++++++++++-
 pandas/core/indexes/base.py                   |  15 +-
 pandas/core/indexes/category.py               |  54 ++-
 pandas/core/indexes/interval.py               |   3 +-
 pandas/core/indexes/multi.py                  |   2 +-
 pandas/core/indexes/range.py                  |   2 +-
 pandas/core/internals.py                      |  20 +-
 pandas/core/series.py                         |   3 +-
 pandas/core/sorting.py                        |   3 +-
 pandas/core/util/hashing.py                   |   2 +-
 pandas/tests/dtypes/test_common.py            |  10 +-
 pandas/tests/dtypes/test_dtypes.py            | 111 +++++-
 pandas/tests/frame/test_analytics.py          |   3 +
 pandas/tests/indexes/test_category.py         |  10 +-
 .../tests/io/json/test_json_table_schema.py   |   5 +-
 pandas/tests/io/test_parquet.py               |   4 +
 pandas/tests/io/test_pytables.py              |  10 +-
 pandas/tests/reshape/test_merge.py            |   4 +-
 pandas/tests/series/test_analytics.py         |  11 +-
 pandas/tests/series/test_constructors.py      |  21 ++
 pandas/tests/series/test_dtypes.py            |  34 +-
 pandas/tests/test_algos.py                    |  72 ++--
 pandas/tests/test_categorical.py              | 141 +++++++-
 pandas/util/testing.py                        |   7 +-
 31 files changed, 970 insertions(+), 283 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 3bda8c7eacb61..799d04859cc2a 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup
 
 .. ipython:: python
 
+   from pandas.api.types import CategoricalDtype
+
    df = pd.DataFrame({'A': np.arange(6),
                       'B': list('aabbca')})
-   df['B'] = df['B'].astype('category', categories=list('cab'))
+   df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
    df
    df.dtypes
    df.B.cat.categories
diff --git a/doc/source/api.rst b/doc/source/api.rst
index 6b3e6bedcb24b..b822b7943f1d6 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like
 Categorical
 ~~~~~~~~~~~
 
-If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical
+.. autoclass:: api.types.CategoricalDtype
+   :members: categories, ordered
+
+If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical
 data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the
 following usable methods and properties:
 
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 65361886436d6..55b5d93e94943 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
     df["B"] = raw_cat
     df
 
-You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``:
+Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of
+
+1. categories are inferred from the data
+2. categories are unordered.
+
+To control those behaviors, instead of passing ``'category'``, use an instance
+of :class:`~pandas.api.types.CategoricalDtype`.
 
 .. ipython:: python
 
-    s = pd.Series(["a","b","c","a"])
-    s_cat = s.astype("category", categories=["b","c","d"], ordered=False)
+    from pandas.api.types import CategoricalDtype
+
+    s = pd.Series(["a", "b", "c", "a"])
+    cat_type = CategoricalDtype(categories=["b", "c", "d"],
+                                ordered=True)
+    s_cat = s.astype(cat_type)
     s_cat
 
 Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
@@ -133,6 +143,73 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
+.. _categorical.categoricaldtype:
+
+CategoricalDtype
+----------------
+
+.. versionchanged:: 0.21.0
+
+A categorical's type is fully described by
+
+1. ``categories``: a sequence of unique values and no missing values
+2. ``ordered``: a boolean
+
+This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`.
+The ``categories`` argument is optional, which implies that the actual categories
+should be inferred from whatever is present in the data when the
+:class:`pandas.Categorical` is created. The categories are assumed to be unordered
+by default.      
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   CategoricalDtype(['a', 'b', 'c'])
+   CategoricalDtype(['a', 'b', 'c'], ordered=True)
+   CategoricalDtype()
+
+A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas
+expects a `dtype`. For example :func:`pandas.read_csv`,
+:func:`pandas.DataFrame.astype`, or in the Series constructor.
+
+.. note::
+
+    As a convenience, you can use the string ``'category'`` in place of a
+    :class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of
+    the categories being unordered, and equal to the set values present in the
+    array. In other words, ``dtype='category'`` is equivalent to
+    ``dtype=CategoricalDtype()``.
+
+Equality Semantics
+~~~~~~~~~~~~~~~~~~
+
+Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal
+whenever they have the same categories and orderedness. When comparing two
+unordered categoricals, the order of the ``categories`` is not considered
+
+.. ipython:: python
+
+   c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)
+
+   # Equal, since order is not considered when ordered=False
+   c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False)
+
+   # Unequal, since the second CategoricalDtype is ordered
+   c1 == CategoricalDtype(['a',  'b', 'c'], ordered=True)
+
+All instances of ``CategoricalDtype`` compare equal to the string ``'category'``
+
+.. ipython:: python
+
+   c1 == 'category'
+
+.. warning::
+
+   Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``,
+   and since all instances ``CategoricalDtype`` compare equal to ``'`category'``,
+   all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``
+
 Description
 -----------
 
@@ -184,7 +261,7 @@ It's also possible to pass in the categories in a specific order:
 
     .. ipython:: python
 
-         s = pd.Series(list('babc')).astype('category', categories=list('abcd'))
+         s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))
          s
 
          # categories
@@ -297,7 +374,9 @@ meaning and certain operations are possible. If the categorical is unordered, ``
 
     s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False))
     s.sort_values(inplace=True)
-    s = pd.Series(["a","b","c","a"]).astype('category', ordered=True)
+    s = pd.Series(["a","b","c","a"]).astype(
+        CategoricalDtype(ordered=True)
+    )
     s.sort_values(inplace=True)
     s
     s.min(), s.max()
@@ -397,9 +476,15 @@ categories or a categorical with any list-like object, will raise a TypeError.
 
 .. ipython:: python
 
-    cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True)
-    cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True)
-    cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True)
+    cat = pd.Series([1,2,3]).astype(
+        CategoricalDtype([3, 2, 1], ordered=True)
+    )
+    cat_base = pd.Series([2,2,2]).astype(
+        CategoricalDtype([3, 2, 1], ordered=True)
+    )
+    cat_base2 = pd.Series([2,2,2]).astype(
+        CategoricalDtype(ordered=True)
+    )
 
     cat
     cat_base
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index 72787ea97a782..ad40c75a62722 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -830,8 +830,10 @@ The left frame.
 
 .. ipython:: python
 
+   from pandas.api.types import CategoricalDtype
+
    X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
-   X = X.astype('category', categories=['foo', 'bar'])
+   X = X.astype(CategoricalDtype(categories=['foo', 'bar']))
 
    left = pd.DataFrame({'X': X,
                         'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
@@ -842,8 +844,11 @@ The right frame.
 
 .. ipython:: python
 
-   right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']),
-                         'Z': [1, 2]})
+   right = pd.DataFrame({
+        'X': pd.Series(['foo', 'bar'],
+                       dtype=CategoricalDtype(['foo', 'bar'])),
+        'Z': [1, 2]
+   })
    right
    right.dtypes
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 5a353544a4283..e73572c296eac 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -10,6 +10,8 @@ users upgrade to this version.
 Highlights include:
 
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
+- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
+  categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
 
 Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
 
@@ -89,6 +91,30 @@ This does not raise any obvious exceptions, but also does not create a new colum
 
 Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access <indexing.attribute_access>`.
 
+.. _whatsnew_0210.enhancements.categorical_dtype:
+
+``CategoricalDtype`` for specifying categoricals
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`pandas.api.types.CategoricalDtype` has been added to the public API and
+expanded to include the ``categories`` and ``ordered`` attributes. A
+``CategoricalDtype`` can be used to specify the set of categories and
+orderedness of an array, independent of the data themselves. This can be useful,
+e.g., when converting string data to a ``Categorical`` (:issue:`14711`, :issue:`15078`):
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   s = pd.Series(['a', 'b', 'c', 'a'])  # strings
+   dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True)
+   s.astype(dtype)
+
+The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
+``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
+
+See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
+
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index ddca93f07ad5e..7e92255ef0419 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -23,7 +23,7 @@
     is_datetimelike,
     is_categorical,
     is_categorical_dtype,
-    is_integer_dtype, is_bool,
+    is_integer_dtype,
     is_list_like, is_sequence,
     is_scalar)
 from pandas.core.common import is_null_slice, _maybe_box_datetimelike
@@ -139,33 +139,6 @@ def maybe_to_categorical(array):
 setter to change values in the categorical.
 """
 
-_categories_doc = """The categories of this categorical.
-
-Setting assigns new values to each category (effectively a rename of
-each individual category).
-
-The assigned value has to be a list-like object. All items must be unique and
-the number of items in the new categories must be the same as the number of
-items in the old categories.
-
-Assigning to `categories` is a inplace operation!
-
-Raises
-------
-ValueError
-    If the new categories do not validate as categories or if the number of new
-    categories is unequal the number of old categories
-
-See also
---------
-rename_categories
-reorder_categories
-add_categories
-remove_categories
-remove_unused_categories
-set_categories
-"""
-
 
 class Categorical(PandasObject):
     """
@@ -192,6 +165,10 @@ class Categorical(PandasObject):
     ordered : boolean, (default False)
         Whether or not this categorical is treated as a ordered categorical.
         If not given, the resulting categorical will not be ordered.
+    dtype : CategoricalDtype
+        An instance of ``CategoricalDtype`` to use for this categorical
+
+        .. versionadded:: 0.21.0
 
     Attributes
     ----------
@@ -202,6 +179,11 @@ class Categorical(PandasObject):
         categorical, read only.
     ordered : boolean
         Whether or not this Categorical is ordered.
+    dtype : CategoricalDtype
+        The instance of ``CategoricalDtype`` storing the ``categories``
+        and ``ordered``.
+
+        .. versionadded:: 0.21.0
 
     Raises
     ------
@@ -211,7 +193,6 @@ class Categorical(PandasObject):
         If an explicit ``ordered=True`` is given but no `categories` and the
         `values` are not sortable.
 
-
     Examples
     --------
     >>> from pandas import Categorical
@@ -223,17 +204,17 @@ class Categorical(PandasObject):
     [a, b, c, a, b, c]
     Categories (3, object): [a < b < c]
 
+    Only ordered `Categoricals` can be sorted (according to the order
+    of the categories) and have a min and max value.
+
     >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'],
                         ordered=True)
     >>> a.min()
     'c'
-    """
-    dtype = CategoricalDtype()
-    """The dtype (always "category")"""
-    """Whether or not this Categorical is ordered.
 
-    Only ordered `Categoricals` can be sorted (according to the order
-    of the categories) and have a min and max value.
+    Notes
+    -----
+    See the :ref:`user guide <categorical>` for more.
 
     See also
     --------
@@ -246,24 +227,39 @@ class Categorical(PandasObject):
     # For comparisons, so that numpy uses our implementation if the compare
     # ops, which raise
     __array_priority__ = 1000
+    _dtype = CategoricalDtype()
     _typ = 'categorical'
 
-    def __init__(self, values, categories=None, ordered=False, fastpath=False):
+    def __init__(self, values, categories=None, ordered=None, dtype=None,
+                 fastpath=False):
 
-        self._validate_ordered(ordered)
+        if dtype is not None:
+            if isinstance(dtype, compat.string_types):
+                if dtype == 'category':
+                    dtype = CategoricalDtype(categories, ordered)
+                else:
+                    raise ValueError("Unknown `dtype` {}".format(dtype))
+            elif categories is not None or ordered is not None:
+                raise ValueError("Cannot specify both `dtype` and `categories`"
+                                 " or `ordered`.")
+
+            categories = dtype.categories
+            ordered = dtype.ordered
+
+        if ordered is None:
+            ordered = False
 
         if fastpath:
-            # fast path
+            if dtype is None:
+                dtype = CategoricalDtype(categories, ordered)
             self._codes = coerce_indexer_dtype(values, categories)
-            self._categories = self._validate_categories(
-                categories, fastpath=isinstance(categories, ABCIndexClass))
-            self._ordered = ordered
+            self._dtype = dtype
             return
 
         # sanitize input
         if is_categorical_dtype(values):
 
-            # we are either a Series or a CategoricalIndex
+            # we are either a Series, CategoricalIndex
             if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
                 values = values._values
 
@@ -313,7 +309,8 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
                 raise NotImplementedError("> 1 ndim Categorical are not "
                                           "supported at this time")
 
-            categories = self._validate_categories(categories)
+            if dtype is None or isinstance(dtype, str):
+                dtype = CategoricalDtype(categories, ordered)
 
         else:
             # there were two ways if categories are present
@@ -325,12 +322,15 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
 
             # make sure that we always have the same type here, no matter what
             # we get passed in
-            categories = self._validate_categories(categories)
-            codes = _get_codes_for_values(values, categories)
+            if dtype is None or isinstance(dtype, str):
+                dtype = CategoricalDtype(categories, ordered)
+
+            codes = _get_codes_for_values(values, dtype.categories)
 
             # TODO: check for old style usage. These warnings should be removes
             # after 0.18/ in 2016
-            if is_integer_dtype(values) and not is_integer_dtype(categories):
+            if (is_integer_dtype(values) and
+                    not is_integer_dtype(dtype.categories)):
                 warn("Values and categories have different dtypes. Did you "
                      "mean to use\n'Categorical.from_codes(codes, "
                      "categories)'?", RuntimeWarning, stacklevel=2)
@@ -341,9 +341,57 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
                      "mean to use\n'Categorical.from_codes(codes, "
                      "categories)'?", RuntimeWarning, stacklevel=2)
 
-        self.set_ordered(ordered or False, inplace=True)
-        self._categories = categories
-        self._codes = coerce_indexer_dtype(codes, categories)
+        self._dtype = dtype
+        self._codes = coerce_indexer_dtype(codes, dtype.categories)
+
+    @property
+    def categories(self):
+        """The categories of this categorical.
+
+        Setting assigns new values to each category (effectively a rename of
+        each individual category).
+
+        The assigned value has to be a list-like object. All items must be
+        unique and the number of items in the new categories must be the same
+        as the number of items in the old categories.
+
+        Assigning to `categories` is a inplace operation!
+
+        Raises
+        ------
+        ValueError
+            If the new categories do not validate as categories or if the
+            number of new categories is unequal the number of old categories
+
+        See also
+        --------
+        rename_categories
+        reorder_categories
+        add_categories
+        remove_categories
+        remove_unused_categories
+        set_categories
+        """
+        return self.dtype.categories
+
+    @categories.setter
+    def categories(self, categories):
+        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
+        if (self.dtype.categories is not None and
+                len(self.dtype.categories) != len(new_dtype.categories)):
+            raise ValueError("new categories need to have the same number of "
+                             "items as the old categories!")
+        self._dtype = new_dtype
+
+    @property
+    def ordered(self):
+        """Whether the categories have an ordered relationship"""
+        return self.dtype.ordered
+
+    @property
+    def dtype(self):
+        """The :ref:`~pandas.api.types.CategoricalDtype` for this instance"""
+        return self._dtype
 
     def __dir__(self):
         # Avoid IPython warnings for deprecated properties
@@ -492,7 +540,7 @@ def from_codes(cls, codes, categories, ordered=False):
             raise ValueError(
                 "codes need to be convertible to an arrays of integers")
 
-        categories = cls._validate_categories(categories)
+        categories = CategoricalDtype._validate_categories(categories)
 
         if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and "
@@ -535,69 +583,6 @@ def _get_labels(self):
 
     labels = property(fget=_get_labels, fset=_set_codes)
 
-    _categories = None
-
-    @classmethod
-    def _validate_ordered(cls, ordered):
-        """
-        Validates that we have a valid ordered parameter. If
-        it is not a boolean, a TypeError will be raised.
-
-        Parameters
-        ----------
-        ordered : object
-            The parameter to be verified.
-
-        Raises
-        ------
-        TypeError
-            If 'ordered' is not a boolean.
-        """
-        if not is_bool(ordered):
-            raise TypeError("'ordered' must either be 'True' or 'False'")
-
-    @classmethod
-    def _validate_categories(cls, categories, fastpath=False):
-        """
-        Validates that we have good categories
-
-        Parameters
-        ----------
-        fastpath : boolean (default: False)
-           Don't perform validation of the categories for uniqueness or nulls
-
-        """
-        if not isinstance(categories, ABCIndexClass):
-            dtype = None
-            if not hasattr(categories, "dtype"):
-                if not is_list_like(categories):
-                    raise TypeError("`categories` must be list-like. "
-                                    "Got {} instead".format(repr(categories)))
-                categories = _convert_to_list_like(categories)
-                # On categories with NaNs, int values would be converted to
-                # float. Use "object" dtype to prevent this.
-                if isna(categories).any():
-                    without_na = np.array([x for x in categories
-                                           if notna(x)])
-                    with_na = np.array(categories)
-                    if with_na.dtype != without_na.dtype:
-                        dtype = "object"
-
-            from pandas import Index
-            categories = Index(categories, dtype=dtype)
-
-        if not fastpath:
-
-            # Categories cannot contain NaN.
-            if categories.hasnans:
-                raise ValueError('Categorial categories cannot be null')
-
-            # Categories must be unique.
-            if not categories.is_unique:
-                raise ValueError('Categorical categories must be unique')
-
-        return categories
-
     def _set_categories(self, categories, fastpath=False):
         """ Sets new categories
 
@@ -608,21 +593,17 @@ def _set_categories(self, categories, fastpath=False):
 
         """
 
-        categories = self._validate_categories(categories, fastpath=fastpath)
-        if (not fastpath and self._categories is not None and
-                len(categories) != len(self._categories)):
+        if fastpath:
+            new_dtype = CategoricalDtype._from_fastpath(categories,
+                                                        self.ordered)
+        else:
+            new_dtype = CategoricalDtype(categories, ordered=self.ordered)
+        if (not fastpath and self.dtype.categories is not None and
+                len(new_dtype.categories) != len(self.dtype.categories)):
             raise ValueError("new categories need to have the same number of "
                              "items than the old categories!")
 
-        self._categories = categories
-
-    def _get_categories(self):
-        """ Gets the categories """
-        # categories is an Index, which is immutable -> no need to copy
-        return self._categories
-
-    categories = property(fget=_get_categories, fset=_set_categories,
-                          doc=_categories_doc)
+        self._dtype = new_dtype
 
     def _codes_for_groupby(self, sort):
         """
@@ -664,7 +645,21 @@ def _codes_for_groupby(self, sort):
 
         return self.reorder_categories(cat.categories)
 
-    _ordered = None
+    def _set_dtype(self, dtype):
+        """Internal method for directly updating the CategoricalDtype
+
+        Parameters
+        ----------
+        dtype : CategoricalDtype
+
+        Notes
+        -----
+        We don't do any validation here. It's assumed that the dtype is
+        a (valid) instance of `CategoricalDtype`.
+        """
+        codes = _recode_for_categories(self.codes, self.categories,
+                                       dtype.categories)
+        return type(self)(codes, dtype=dtype, fastpath=True)
 
     def set_ordered(self, value, inplace=False):
         """
@@ -679,9 +674,9 @@ def set_ordered(self, value, inplace=False):
            of this categorical with ordered set to the value
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        self._validate_ordered(value)
+        new_dtype = CategoricalDtype(self.categories, ordered=value)
         cat = self if inplace else self.copy()
-        cat._ordered = value
+        cat._dtype = new_dtype
         if not inplace:
             return cat
 
@@ -711,12 +706,6 @@ def as_unordered(self, inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         return self.set_ordered(False, inplace=inplace)
 
-    def _get_ordered(self):
-        """ Gets the ordered attribute """
-        return self._ordered
-
-    ordered = property(fget=_get_ordered)
-
     def set_categories(self, new_categories, ordered=None, rename=False,
                        inplace=False):
         """ Sets the categories to the specified new_categories.
@@ -769,22 +758,21 @@ def set_categories(self, new_categories, ordered=None, rename=False,
         remove_unused_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        new_categories = self._validate_categories(new_categories)
+        if ordered is None:
+            ordered = self.dtype.ordered
+        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
+
         cat = self if inplace else self.copy()
         if rename:
-            if (cat._categories is not None and
-                    len(new_categories) < len(cat._categories)):
+            if (cat.dtype.categories is not None and
+                    len(new_dtype.categories) < len(cat.dtype.categories)):
                 # remove all _codes which are larger and set to -1/NaN
-                self._codes[self._codes >= len(new_categories)] = -1
+                self._codes[self._codes >= len(new_dtype.categories)] = -1
         else:
             codes = _recode_for_categories(self.codes, self.categories,
-                                           new_categories)
+                                           new_dtype.categories)
             cat._codes = codes
-        cat._categories = new_categories
-
-        if ordered is None:
-            ordered = self.ordered
-        cat.set_ordered(ordered, inplace=True)
+        cat._dtype = new_dtype
 
         if not inplace:
             return cat
@@ -864,7 +852,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False):
         set_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        if set(self._categories) != set(new_categories):
+        if set(self.dtype.categories) != set(new_categories):
             raise ValueError("items in new_categories are not the same as in "
                              "old categories")
         return self.set_categories(new_categories, ordered=ordered,
@@ -905,15 +893,17 @@ def add_categories(self, new_categories, inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         if not is_list_like(new_categories):
             new_categories = [new_categories]
-        already_included = set(new_categories) & set(self._categories)
+        already_included = set(new_categories) & set(self.dtype.categories)
         if len(already_included) != 0:
             msg = ("new categories must not include old categories: %s" %
                    str(already_included))
             raise ValueError(msg)
-        new_categories = list(self._categories) + list(new_categories)
+        new_categories = list(self.dtype.categories) + list(new_categories)
+        new_dtype = CategoricalDtype(new_categories, self.ordered)
+
         cat = self if inplace else self.copy()
-        cat._categories = self._validate_categories(new_categories)
-        cat._codes = coerce_indexer_dtype(cat._codes, new_categories)
+        cat._dtype = new_dtype
+        cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
         if not inplace:
             return cat
 
@@ -953,8 +943,9 @@ def remove_categories(self, removals, inplace=False):
             removals = [removals]
 
         removal_set = set(list(removals))
-        not_included = removal_set - set(self._categories)
-        new_categories = [c for c in self._categories if c not in removal_set]
+        not_included = removal_set - set(self.dtype.categories)
+        new_categories = [c for c in self.dtype.categories
+                          if c not in removal_set]
 
         # GH 10156
         if any(isna(removals)):
@@ -996,8 +987,11 @@ def remove_unused_categories(self, inplace=False):
         if idx.size != 0 and idx[0] == -1:  # na sentinel
             idx, inv = idx[1:], inv - 1
 
-        cat._categories = cat.categories.take(idx)
-        cat._codes = coerce_indexer_dtype(inv, self._categories)
+        new_categories = cat.dtype.categories.take(idx)
+        new_dtype = CategoricalDtype._from_fastpath(new_categories,
+                                                    ordered=self.ordered)
+        cat._dtype = new_dtype
+        cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
 
         if not inplace:
             return cat
@@ -1098,7 +1092,7 @@ def __setstate__(self, state):
 
         # Provide compatibility with pre-0.15.0 Categoricals.
         if '_categories' not in state and '_levels' in state:
-            state['_categories'] = self._validate_categories(state.pop(
+            state['_categories'] = self.dtype._validate_categories(state.pop(
                 '_levels'))
         if '_codes' not in state and 'labels' in state:
             state['_codes'] = coerce_indexer_dtype(
@@ -1113,6 +1107,11 @@ def __setstate__(self, state):
             else:
                 state['_ordered'] = False
 
+        # 0.21.0 CategoricalDtype change
+        if '_dtype' not in state:
+            state['_dtype'] = CategoricalDtype(state['_categories'],
+                                               state['_ordered'])
+
         for k, v in compat.iteritems(state):
             setattr(self, k, v)
 
@@ -1122,7 +1121,7 @@ def T(self):
 
     @property
     def nbytes(self):
-        return self._codes.nbytes + self._categories.values.nbytes
+        return self._codes.nbytes + self.dtype.categories.values.nbytes
 
     def memory_usage(self, deep=False):
         """
@@ -1147,7 +1146,8 @@ def memory_usage(self, deep=False):
         --------
         numpy.ndarray.nbytes
         """
-        return self._codes.nbytes + self._categories.memory_usage(deep=deep)
+        return self._codes.nbytes + self.dtype.categories.memory_usage(
+            deep=deep)
 
     @Substitution(klass='Categorical')
     @Appender(_shared_docs['searchsorted'])
@@ -1278,7 +1278,7 @@ def value_counts(self, dropna=True):
             count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)
 
-        ix = self._constructor(ix, categories=cat, ordered=obj.ordered,
+        ix = self._constructor(ix, dtype=self.dtype,
                                fastpath=True)
 
         return Series(count, index=CategoricalIndex(ix), dtype='int64')
@@ -1991,8 +1991,7 @@ def is_dtype_equal(self, other):
         """
 
         try:
-            return (self.categories.equals(other.categories) and
-                    self.ordered == other.ordered)
+            return hash(self.dtype) == hash(other.dtype)
         except (AttributeError, TypeError):
             return False
 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index c47e61dc446be..f60c0d5ffdca0 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -692,6 +692,40 @@ def is_dtype_equal(source, target):
         return False
 
 
+def is_dtype_union_equal(source, target):
+    """
+    Check whether two arrays have compatible dtypes to do a union.
+    numpy types are checked with ``is_dtype_equal``. Extension types are
+    checked separately.
+
+    Parameters
+    ----------
+    source : The first dtype to compare
+    target : The second dtype to compare
+
+    Returns
+    ----------
+    boolean : Whether or not the two dtypes are equal.
+
+    >>> is_dtype_equal("int", int)
+    True
+
+    >>> is_dtype_equal(CategoricalDtype(['a', 'b'],
+    ...                CategoricalDtype(['b', 'c']))
+    True
+
+    >>> is_dtype_equal(CategoricalDtype(['a', 'b'],
+    ...                CategoricalDtype(['b', 'c'], ordered=True))
+    False
+    """
+    source = _get_dtype(source)
+    target = _get_dtype(target)
+    if is_categorical_dtype(source) and is_categorical_dtype(target):
+        # ordered False for both
+        return source.ordered is target.ordered
+    return is_dtype_equal(source, target)
+
+
 def is_any_int_dtype(arr_or_dtype):
     """
     DEPRECATED: This function will be removed in a future version.
@@ -1671,7 +1705,9 @@ def _coerce_to_dtype(dtype):
     """
 
     if is_categorical_dtype(dtype):
-        dtype = CategoricalDtype()
+        categories = getattr(dtype, 'categories', None)
+        ordered = getattr(dtype, 'ordered', False)
+        dtype = CategoricalDtype(categories=categories, ordered=ordered)
     elif is_datetime64tz_dtype(dtype):
         dtype = DatetimeTZDtype(dtype)
     elif is_period_dtype(dtype):
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index dc2c56ea476f9..8be7870be67f2 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -3,6 +3,7 @@
 import re
 import numpy as np
 from pandas import compat
+from pandas.core.dtypes.generic import ABCIndexClass
 
 
 class ExtensionDtype(object):
@@ -110,37 +111,148 @@ class CategoricalDtypeType(type):
 class CategoricalDtype(ExtensionDtype):
 
     """
-    A np.dtype duck-typed class, suitable for holding a custom categorical
-    dtype.
-
-    THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object
+    Type for categorical data with the categories and orderedness
+
+    .. versionchanged:: 0.21.0
+
+    Parameters
+    ----------
+    categories : sequence, optional
+        Must be unique, and must not contain any nulls.
+    ordered : bool, default False
+
+    Notes
+    -----
+    This class is useful for specifying the type of a ``Categorical``
+    independent of the values. See :ref:`categorical.categoricaldtype`
+    for more.
+
+    Examples
+    --------
+    >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True)
+    >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
+    0      a
+    1      b
+    2      a
+    3    NaN
+    dtype: category
+    Categories (2, object): [b < a]
+
+    See Also
+    --------
+    Categorical
     """
+    # TODO: Document public vs. private API
     name = 'category'
     type = CategoricalDtypeType
     kind = 'O'
     str = '|O08'
     base = np.dtype('O')
-    _metadata = []
+    _metadata = ['categories', 'ordered']
     _cache = {}
 
-    def __new__(cls):
+    def __init__(self, categories=None, ordered=False):
+        self._finalize(categories, ordered, fastpath=False)
 
-        try:
-            return cls._cache[cls.name]
-        except KeyError:
-            c = object.__new__(cls)
-            cls._cache[cls.name] = c
-            return c
+    @classmethod
+    def _from_fastpath(cls, categories=None, ordered=False):
+        self = cls.__new__(cls)
+        self._finalize(categories, ordered, fastpath=True)
+        return self
+
+    def _finalize(self, categories, ordered, fastpath=False):
+        from pandas.core.indexes.base import Index
+
+        if categories is not None:
+            categories = Index(categories, tupleize_cols=False)
+            # validation
+            self._validate_categories(categories)
+            self._validate_ordered(ordered)
+        self._categories = categories
+        self._ordered = ordered
+
+    def __setstate__(self, state):
+        self._categories = state.pop('categories', None)
+        self._ordered = state.pop('ordered', False)
 
     def __hash__(self):
-        # make myself hashable
-        return hash(str(self))
+        # _hash_categories returns a uint64, so use the negative
+        # space for when we have unknown categories to avoid a conflict
+        if self.categories is None:
+            if self.ordered:
+                return -1
+            else:
+                return -2
+        # We *do* want to include the real self.ordered here
+        return int(self._hash_categories(self.categories, self.ordered))
 
     def __eq__(self, other):
         if isinstance(other, compat.string_types):
             return other == self.name
 
-        return isinstance(other, CategoricalDtype)
+        if not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
+            return False
+        elif self.categories is None or other.categories is None:
+            # We're forced into a suboptimal corner thanks to math and
+            # backwards compatibility. We require that `CDT(...) == 'category'`
+            # for all CDTs **including** `CDT(None, ...)`. Therefore, *all*
+            # CDT(., .) = CDT(None, False) and *all*
+            # CDT(., .) = CDT(None, True).
+            return True
+        elif self.ordered:
+            return other.ordered and self.categories.equals(other.categories)
+        elif other.ordered:
+            return False
+        else:
+            # both unordered; this could probably be optimized / cached
+            return hash(self) == hash(other)
+
+    def __unicode__(self):
+        tpl = u'CategoricalDtype(categories={}ordered={})'
+        if self.categories is None:
+            data = u"None, "
+        else:
+            data = self.categories._format_data(name=self.__class__.__name__)
+        return tpl.format(data, self.ordered)
+
+    @staticmethod
+    def _hash_categories(categories, ordered=True):
+        from pandas.core.util.hashing import (
+            hash_array, _combine_hash_arrays, hash_tuples
+        )
+
+        if len(categories) and isinstance(categories[0], tuple):
+            # assumes if any individual category is a tuple, then all our. ATM
+            # I don't really want to support just some of the categories being
+            # tuples.
+            categories = list(categories)  # breaks if a np.array of categories
+            cat_array = hash_tuples(categories)
+        else:
+            if categories.dtype == 'O':
+                types = [type(x) for x in categories]
+                if not len(set(types)) == 1:
+                    # TODO: hash_array doesn't handle mixed types. It casts
+                    # everything to a str first, which means we treat
+                    # {'1', '2'} the same as {'1', 2}
+                    # find a better solution
+                    cat_array = np.array([hash(x) for x in categories])
+                    hashed = hash((tuple(categories), ordered))
+                    return hashed
+            cat_array = hash_array(np.asarray(categories), categorize=False)
+        if ordered:
+            cat_array = np.vstack([
+                cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)
+            ])
+        else:
+            cat_array = [cat_array]
+        hashed = _combine_hash_arrays(iter(cat_array),
+                                      num_items=len(cat_array))
+        if len(hashed) == 0:
+            # bug in Numpy<1.12 for length 0 arrays. Just return the correct
+            # value of 0
+            return 0
+        else:
+            return np.bitwise_xor.reduce(hashed)
 
     @classmethod
     def construct_from_string(cls, string):
@@ -154,6 +266,68 @@ def construct_from_string(cls, string):
 
         raise TypeError("cannot construct a CategoricalDtype")
 
+    @staticmethod
+    def _validate_ordered(ordered):
+        """
+        Validates that we have a valid ordered parameter. If
+        it is not a boolean, a TypeError will be raised.
+
+        Parameters
+        ----------
+        ordered : object
+            The parameter to be verified.
+
+        Raises
+        ------
+        TypeError
+            If 'ordered' is not a boolean.
+        """
+        from pandas.core.dtypes.common import is_bool
+        if not is_bool(ordered):
+            raise TypeError("'ordered' must either be 'True' or 'False'")
+
+    @staticmethod
+    def _validate_categories(categories, fastpath=False):
+        """
+        Validates that we have good categories
+
+        Parameters
+        ----------
+        categories : array-like
+        fastpath : bool
+            Whether to skip nan and uniqueness checks
+
+        Returns
+        -------
+        categories : Index
+        """
+        from pandas import Index
+
+        if not isinstance(categories, ABCIndexClass):
+            categories = Index(categories)
+
+        if not fastpath:
+
+            if categories.hasnans:
+                raise ValueError('Categorial categories cannot be null')
+
+            if not categories.is_unique:
+                raise ValueError('Categorical categories must be unique')
+
+        return categories
+
+    @property
+    def categories(self):
+        """
+        An ``Index`` containing the unique categories allowed.
+        """
+        return self._categories
+
+    @property
+    def ordered(self):
+        """Whether the categories have an ordered relationship"""
+        return self._ordered
+
 
 class DatetimeTZDtypeType(type):
     """
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 008828cf4f309..3ce7403f8d726 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -27,6 +27,7 @@
     is_integer,
     is_float,
     is_dtype_equal,
+    is_dtype_union_equal,
     is_object_dtype,
     is_categorical_dtype,
     is_interval_dtype,
@@ -847,7 +848,7 @@ def _formatter_func(self):
         """
         return default_pprint
 
-    def _format_data(self):
+    def _format_data(self, name=None):
         """
         Return the formatted data as a unicode string
         """
@@ -856,9 +857,11 @@ def _format_data(self):
         display_width, _ = get_console_size()
         if display_width is None:
             display_width = get_option('display.width') or 80
+        if name is None:
+            name = self.__class__.__name__
 
-        space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
-        space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2))
+        space1 = "\n%s" % (' ' * (len(name) + 1))
+        space2 = "\n%s" % (' ' * (len(name) + 2))
 
         n = len(self)
         sep = ','
@@ -2170,7 +2173,11 @@ def union(self, other):
         if len(self) == 0:
             return other._get_consensus_name(self)
 
-        if not is_dtype_equal(self.dtype, other.dtype):
+        # TODO: is_dtype_union_equal is a hack around
+        # 1. buggy set ops with duplicates (GH #13432)
+        # 2. CategoricalIndex lacking setops (GH #10186)
+        # Once those are fixed, this workaround can be removed
+        if not is_dtype_union_equal(self.dtype, other.dtype):
             this = self.astype('O')
             other = other.astype('O')
             return this.union(other)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index ef1dc4d971f37..5464bc10b18e5 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -58,16 +58,18 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
                 copy=False, name=None, fastpath=False, **kwargs):
 
         if fastpath:
-            return cls._simple_new(data, name=name)
+            return cls._simple_new(data, name=name, dtype=dtype)
 
         if name is None and hasattr(data, 'name'):
             name = data.name
 
         if isinstance(data, ABCCategorical):
-            data = cls._create_categorical(cls, data, categories, ordered)
+            data = cls._create_categorical(cls, data, categories, ordered,
+                                           dtype)
         elif isinstance(data, CategoricalIndex):
             data = data._data
-            data = cls._create_categorical(cls, data, categories, ordered)
+            data = cls._create_categorical(cls, data, categories, ordered,
+                                           dtype)
         else:
 
             # don't allow scalars
@@ -114,7 +116,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
         return CategoricalIndex(cat, name=name)
 
     @staticmethod
-    def _create_categorical(self, data, categories=None, ordered=None):
+    def _create_categorical(self, data, categories=None, ordered=None,
+                            dtype=None):
         """
         *this is an internal non-public method*
 
@@ -125,6 +128,7 @@ def _create_categorical(self, data, categories=None, ordered=None):
         data : data for new Categorical
         categories : optional categories, defaults to existing
         ordered : optional ordered attribute, defaults to existing
+        dtype : CategoricalDtype, defaults to existing
 
         Returns
         -------
@@ -135,22 +139,30 @@ def _create_categorical(self, data, categories=None, ordered=None):
             data = data.values
 
         if not isinstance(data, ABCCategorical):
-            ordered = False if ordered is None else ordered
+            if ordered is None and dtype is None:
+                ordered = False
             from pandas.core.categorical import Categorical
-            data = Categorical(data, categories=categories, ordered=ordered)
+            data = Categorical(data, categories=categories, ordered=ordered,
+                               dtype=dtype)
         else:
+            from pandas.core.dtypes.dtypes import CategoricalDtype
+
             if categories is not None:
-                data = data.set_categories(categories)
-            if ordered is not None:
+                data = data.set_categories(categories, ordered=ordered)
+            elif ordered is not None and ordered != data.ordered:
                 data = data.set_ordered(ordered)
+            if isinstance(dtype, CategoricalDtype):
+                # we want to silently ignore dtype='category'
+                data = data._set_dtype(dtype)
         return data
 
     @classmethod
     def _simple_new(cls, values, name=None, categories=None, ordered=None,
-                    **kwargs):
+                    dtype=None, **kwargs):
         result = object.__new__(cls)
 
-        values = cls._create_categorical(cls, values, categories, ordered)
+        values = cls._create_categorical(cls, values, categories, ordered,
+                                         dtype=dtype)
         result._data = values
         result.name = name
         for k, v in compat.iteritems(kwargs):
@@ -161,16 +173,28 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None,
 
     @Appender(_index_shared_docs['_shallow_copy'])
     def _shallow_copy(self, values=None, categories=None, ordered=None,
-                      **kwargs):
+                      dtype=None, **kwargs):
         # categories and ordered can't be part of attributes,
         # as these are properties
+        # we want to reuse self.dtype if possible, i.e. neither are
+        # overridden.
+        if dtype is not None and (categories is not None or
+                                  ordered is not None):
+            raise TypeError("Cannot specify both `dtype` and `categories` "
+                            "or `ordered`")
+
+        if categories is None and ordered is None:
+            dtype = self.dtype if dtype is None else dtype
+            return super(CategoricalIndex, self)._shallow_copy(
+                values=values, dtype=dtype, **kwargs)
         if categories is None:
             categories = self.categories
         if ordered is None:
             ordered = self.ordered
-        return super(CategoricalIndex,
-                     self)._shallow_copy(values=values, categories=categories,
-                                         ordered=ordered, **kwargs)
+
+        return super(CategoricalIndex, self)._shallow_copy(
+            values=values, categories=categories,
+            ordered=ordered, **kwargs)
 
     def _is_dtype_compat(self, other):
         """
@@ -236,7 +260,7 @@ def _format_attrs(self):
             ('ordered', self.ordered)]
         if self.name is not None:
             attrs.append(('name', ibase.default_pprint(self.name)))
-        attrs.append(('dtype', "'%s'" % self.dtype))
+        attrs.append(('dtype', "'%s'" % self.dtype.name))
         max_seq_items = get_option('display.max_seq_items') or len(self)
         if len(self) > max_seq_items:
             attrs.append(('length', len(self)))
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index c0a9c139722f5..c36ef020faf31 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -950,9 +950,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs):
                                       na_rep=na_rep,
                                       justify='all').get_result()
 
-    def _format_data(self):
+    def _format_data(self, name=None):
 
         # TODO: integrate with categorical and make generic
+        # name argument is unused here; just for compat with base / categorical
         n = len(self)
         max_seq_items = min((get_option(
             'display.max_seq_items') or n) // 10, 10)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index ea613a27b6521..9de69c9c3e97c 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -490,7 +490,7 @@ def _format_attrs(self):
     def _format_space(self):
         return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
 
-    def _format_data(self):
+    def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index b759abaed4e56..81600f1baa842 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -189,7 +189,7 @@ def _format_attrs(self):
             attrs.append(('name', ibase.default_pprint(self.name)))
         return attrs
 
-    def _format_data(self):
+    def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 83b382ec0ed72..e510ca87e44aa 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype):
         validate that we have a astypeable to categorical,
         returns a boolean if we are a categorical
         """
-        if is_categorical_dtype(dtype):
-            if dtype == CategoricalDtype():
-                return True
-
+        if dtype is Categorical or dtype is CategoricalDtype:
             # this is a pd.Categorical, but is not
             # a valid type for astypeing
             raise TypeError("invalid type {0} for astype".format(dtype))
 
+        elif is_categorical_dtype(dtype):
+            return True
+
         return False
 
     def external_values(self, dtype=None):
@@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
         # may need to convert to categorical
         # this is only called for non-categoricals
         if self.is_categorical_astype(dtype):
+            if (('categories' in kwargs or 'ordered' in kwargs) and
+                    isinstance(dtype, CategoricalDtype)):
+                raise TypeError("Cannot specify a CategoricalDtype and also "
+                                "`categories` or `ordered`. Use "
+                                "`dtype=CategoricalDtype(categories, ordered)`"
+                                " instead.")
+            kwargs = kwargs.copy()
+            categories = getattr(dtype, 'categories', None)
+            ordered = getattr(dtype, 'ordered', False)
+
+            kwargs.setdefault('categories', categories)
+            kwargs.setdefault('ordered', ordered)
             return self.make_block(Categorical(self.values, **kwargs))
 
         # astype processing
diff --git a/pandas/core/series.py b/pandas/core/series.py
index ac11c5f908fdc..bc84bd09f0b44 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2987,7 +2987,8 @@ def _try_cast(arr, take_fast_path):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
-                subarr = Categorical(arr)
+                subarr = Categorical(arr, dtype.categories,
+                                     ordered=dtype.ordered)
             elif dtype is not None and raise_cast_failure:
                 raise
             else:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 12e8d8aba9177..27252b9616a44 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -2,7 +2,6 @@
 
 import numpy as np
 from pandas.compat import long, string_types, PY3
-from pandas.core.categorical import Categorical
 from pandas.core.dtypes.common import (
     _ensure_platform_int,
     _ensure_int64,
@@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True):
 
 
 def lexsort_indexer(keys, orders=None, na_position='last'):
+    from pandas.core.categorical import Categorical
+
     labels = []
     shape = []
     if isinstance(orders, bool):
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 07e993d7ef509..0c82773b75c28 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
     # For categoricals, we hash the categories, then remap the codes to the
     # hash values. (This check is above the complex check so that we don't ask
-    # numpy if categorical is a subdtype of complex, as it will choke.
+    # numpy if categorical is a subdtype of complex, as it will choke).
     if is_categorical_dtype(dtype):
         return _hash_categorical(vals, encoding, hash_key)
 
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 8a36f234484b4..e0be34b14a97d 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -545,10 +545,12 @@ def test_is_complex_dtype():
     (pd.Index([1, 2]), np.dtype('int64')),
     (pd.Index(['a', 'b']), np.dtype(object)),
     ('category', 'category'),
-    (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()),
-    (pd.Categorical(['a', 'b']), CategoricalDtype()),
-    (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()),
-    (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()),
+    (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])),
+    (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])),
+    (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])),
+    (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])),
+    (CategoricalDtype(), CategoricalDtype()),
+    (CategoricalDtype(['a', 'b']), CategoricalDtype()),
     (pd.DatetimeIndex([1, 2]), np.dtype('<M8[ns]')),
     (pd.DatetimeIndex([1, 2]).dtype, np.dtype('<M8[ns]')),
     ('<M8[ns]', np.dtype('<M8[ns]')),
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index fb20571213c15..ce06f013dc85d 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -66,21 +66,13 @@ def test_pickle(self):
 
         # force back to the cache
         result = tm.round_trip_pickle(self.dtype)
-
-        # we are a singular object so we are added
-        # back to the cache upon unpickling
-        # this is to ensure object identity
-        assert len(self.dtype._cache) == 1
         assert result == self.dtype
 
     def test_hash_vs_equality(self):
-        # make sure that we satisfy is semantics
         dtype = self.dtype
         dtype2 = CategoricalDtype()
         assert dtype == dtype2
         assert dtype2 == dtype
-        assert dtype is dtype2
-        assert dtype2 is dtype
         assert hash(dtype) == hash(dtype2)
 
     def test_equality(self):
@@ -94,6 +86,11 @@ def test_construction_from_string(self):
         pytest.raises(
             TypeError, lambda: CategoricalDtype.construct_from_string('foo'))
 
+    def test_constructor_invalid(self):
+        with tm.assert_raises_regex(TypeError,
+                                    "CategoricalIndex.* must be called"):
+            CategoricalDtype("category")
+
     def test_is_dtype(self):
         assert CategoricalDtype.is_dtype(self.dtype)
         assert CategoricalDtype.is_dtype('category')
@@ -119,6 +116,11 @@ def test_basic(self):
         assert not is_categorical(np.dtype('float64'))
         assert not is_categorical(1.0)
 
+    def test_tuple_categories(self):
+        categories = [(1, 'a'), (2, 'b'), (3, 'c')]
+        result = CategoricalDtype(categories)
+        assert all(result.categories == categories)
+
 
 class TestDatetimeTZDtype(Base):
 
@@ -524,3 +526,96 @@ def test_caching(self):
         IntervalDtype.reset_cache()
         tm.round_trip_pickle(dtype)
         assert len(IntervalDtype._cache) == 0
+
+
+class TestCategoricalDtypeParametrized(object):
+
+    @pytest.mark.parametrize('categories, ordered', [
+        (['a', 'b', 'c', 'd'], False),
+        (['a', 'b', 'c', 'd'], True),
+        (np.arange(1000), False),
+        (np.arange(1000), True),
+        (['a', 'b', 10, 2, 1.3, True], False),
+        ([True, False], True),
+        ([True, False], False),
+        (pd.date_range('2017', periods=4), True),
+        (pd.date_range('2017', periods=4), False),
+    ])
+    def test_basic(self, categories, ordered):
+        c1 = CategoricalDtype(categories, ordered=ordered)
+        tm.assert_index_equal(c1.categories, pd.Index(categories))
+        assert c1.ordered is ordered
+
+    def test_order_matters(self):
+        categories = ['a', 'b']
+        c1 = CategoricalDtype(categories, ordered=False)
+        c2 = CategoricalDtype(categories, ordered=True)
+        assert c1 is not c2
+
+    def test_unordered_same(self):
+        c1 = CategoricalDtype(['a', 'b'])
+        c2 = CategoricalDtype(['b', 'a'])
+        assert hash(c1) == hash(c2)
+
+    def test_categories(self):
+        result = CategoricalDtype(['a', 'b', 'c'])
+        tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
+        assert result.ordered is False
+
+    def test_equal_but_different(self):
+        c1 = CategoricalDtype([1, 2, 3])
+        c2 = CategoricalDtype([1., 2., 3.])
+        assert c1 is not c2
+        assert c1 != c2
+
+    @pytest.mark.parametrize('v1, v2', [
+        ([1, 2, 3], [1, 2, 3]),
+        ([1, 2, 3], [3, 2, 1]),
+    ])
+    def test_order_hashes_different(self, v1, v2):
+        c1 = CategoricalDtype(v1)
+        c2 = CategoricalDtype(v2, ordered=True)
+        assert c1 is not c2
+
+    def test_nan_invalid(self):
+        with pytest.raises(ValueError):
+            CategoricalDtype([1, 2, np.nan])
+
+    def test_non_unique_invalid(self):
+        with pytest.raises(ValueError):
+            CategoricalDtype([1, 2, 1])
+
+    def test_same_categories_different_order(self):
+        c1 = CategoricalDtype(['a', 'b'], ordered=True)
+        c2 = CategoricalDtype(['b', 'a'], ordered=True)
+        assert c1 is not c2
+
+    @pytest.mark.parametrize('ordered, other, expected', [
+        (True, CategoricalDtype(['a', 'b'], True), True),
+        (False, CategoricalDtype(['a', 'b'], False), True),
+        (True, CategoricalDtype(['a', 'b'], False), False),
+        (False, CategoricalDtype(['a', 'b'], True), False),
+        (True, CategoricalDtype([1, 2], False), False),
+        (False, CategoricalDtype([1, 2], True), False),
+        (False, CategoricalDtype(None, True), True),
+        (True, CategoricalDtype(None, True), True),
+        (False, CategoricalDtype(None, False), True),
+        (True, CategoricalDtype(None, False), True),
+        (True, 'category', True),
+        (False, 'category', True),
+        (True, 'not a category', False),
+        (False, 'not a category', False),
+    ])
+    def test_categorical_equality(self, ordered, other, expected):
+        c1 = CategoricalDtype(['a', 'b'], ordered)
+        result = c1 == other
+        assert result == expected
+
+    def test_invalid_raises(self):
+        with tm.assert_raises_regex(TypeError, 'ordered'):
+            CategoricalDtype(['a', 'b'], ordered='foo')
+
+    def test_mixed(self):
+        a = CategoricalDtype(['a', 'b', 1, 2])
+        b = CategoricalDtype(['a', 'b', '1', '2'])
+        assert hash(a) != hash(b)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 93514a8a42215..6e9b531dec566 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -2082,6 +2082,9 @@ def test_n_error(self, df_main_dtypes, method, columns):
         df = df_main_dtypes
         error_msg = self.dtype_error_msg_template.format(
             column=columns[1], method=method, dtype=df[columns[1]].dtype)
+        # escape some characters that may be in the repr
+        error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
+                              .replace("[", "\\[").replace("]", "\\]"))
         with tm.assert_raises_regex(TypeError, error_msg):
             getattr(df, method)(2, columns)
 
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index cf365465763fa..d8ec23b9c7e0e 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -654,7 +654,11 @@ def test_equals_categorical(self):
         # make sure that we are testing for category inclusion properly
         ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b'])
         assert not ci.equals(list('aabca'))
-        assert not ci.equals(CategoricalIndex(list('aabca')))
+        # Same categories, but different order
+        # Unordered
+        assert ci.equals(CategoricalIndex(list('aabca')))
+        # Ordered
+        assert not ci.equals(CategoricalIndex(list('aabca'), ordered=True))
         assert ci.equals(ci.copy())
 
         ci = CategoricalIndex(list('aabca') + [np.nan],
@@ -666,7 +670,9 @@ def test_equals_categorical(self):
         ci = CategoricalIndex(list('aabca') + [np.nan],
                               categories=['c', 'a', 'b'])
         assert not ci.equals(list('aabca') + [np.nan])
-        assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan]))
+        assert ci.equals(CategoricalIndex(list('aabca') + [np.nan]))
+        assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan],
+                                              ordered=True))
         assert ci.equals(ci.copy())
 
     def test_string_categorical_index_repr(self):
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index e097194674cf6..dab56e264b955 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -164,7 +164,10 @@ def test_as_json_table_type_string_dtypes(self):
             assert as_json_table_type(t) == 'string'
 
     def test_as_json_table_type_categorical_dtypes(self):
-        assert as_json_table_type(pd.Categorical) == 'any'
+        # TODO: I think before is_categorical_dtype(Categorical)
+        # returned True, but now it's False. Figure out why or
+        # if it matters
+        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
         assert as_json_table_type(CategoricalDtype()) == 'any'
 
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 2d69f6d38475e..4382958474c9f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pandas as pd
 from pandas.compat import PY3
+from distutils.version import LooseVersion
+from pandas.compat import PY3, is_platform_windows
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
                                PyArrowImpl, FastParquetImpl)
 from pandas.util import testing as tm
@@ -394,6 +396,8 @@ def test_unsupported(self, fp):
         self.check_error_on_write(df, fp, ValueError)
 
     def test_categorical(self, fp):
+        if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"):
+            pytest.skip("CategoricalDtype not supported for older fp")
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
         self.check_round_trip(df, fp, compression=None)
 
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index f331378b654be..2a6d16fb39cc3 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -18,6 +18,7 @@
 
 from pandas.compat import is_platform_windows, PY3, PY35, BytesIO, text_type
 from pandas.io.formats.printing import pprint_thing
+from pandas.core.dtypes.common import is_categorical_dtype
 
 tables = pytest.importorskip('tables')
 from pandas.io.pytables import TableIterator
@@ -1090,7 +1091,12 @@ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
                          nan_rep=nan_rep)
                 retr = read_hdf(store, key)
                 s_nan = s.replace(nan_rep, np.nan)
-                assert_series_equal(s_nan, retr, check_categorical=False)
+                if is_categorical_dtype(s_nan):
+                    assert is_categorical_dtype(retr)
+                    assert_series_equal(s_nan, retr, check_dtype=False,
+                                        check_categorical=False)
+                else:
+                    assert_series_equal(s_nan, retr)
 
         for s in examples:
             roundtrip(s)
@@ -4845,7 +4851,7 @@ def test_categorical(self):
             # Make sure the metadata is OK
             info = store.info()
             assert '/df2   ' in info
-            assert '/df2/meta/values_block_0/meta' in info
+            # assert '/df2/meta/values_block_0/meta' in info
             assert '/df2/meta/values_block_1/meta' in info
 
             # unordered
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py
index 338596d1523e4..df75983a29d80 100644
--- a/pandas/tests/reshape/test_merge.py
+++ b/pandas/tests/reshape/test_merge.py
@@ -1468,8 +1468,6 @@ def test_other_columns(self, left, right):
 
     @pytest.mark.parametrize(
         'change', [lambda x: x,
-                   lambda x: x.astype('category',
-                                      categories=['bar', 'foo']),
                    lambda x: x.astype('category',
                                       categories=['foo', 'bar', 'bah']),
                    lambda x: x.astype('category', ordered=True)])
@@ -1481,7 +1479,7 @@ def test_dtype_on_merged_different(self, change, how, left, right):
         X = change(right.X.astype('object'))
         right = right.assign(X=X)
         assert is_categorical_dtype(left.X.values)
-        assert not left.X.values.is_dtype_equal(right.X.values)
+        # assert not left.X.values.is_dtype_equal(right.X.values)
 
         merged = pd.merge(left, right, on='X', how=how)
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index f1d044f7a1132..914181dc94154 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1756,7 +1756,6 @@ class TestNLargestNSmallest(object):
               # not supported on some archs
               # Series([3., 2, 1, 2, 5], dtype='complex256'),
               Series([3., 2, 1, 2, 5], dtype='complex128'),
-              Series(list('abcde'), dtype='category'),
               Series(list('abcde'))])
     def test_error(self, r):
         dt = r.dtype
@@ -1768,6 +1767,16 @@ def test_error(self, r):
             with tm.assert_raises_regex(TypeError, msg):
                 method(arg)
 
+    def test_error_categorical_dtype(self):
+        # same as test_error, but regex hard to escape properly
+        msg = ("Cannot use method 'n(larg|small)est' with dtype "
+               "CategoricalDtype.+")
+        with tm.assert_raises_regex(TypeError, msg):
+            Series(list('ab'), dtype='category').nlargest(2)
+
+        with tm.assert_raises_regex(TypeError, msg):
+            Series(list('ab'), dtype='category').nsmallest(2)
+
     @pytest.mark.parametrize(
         "s",
         [v for k, v in s_main_dtypes().iteritems()])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 3b95c2803dd9e..df7d7a946e881 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -10,6 +10,7 @@
 import numpy.ma as ma
 import pandas as pd
 
+from pandas.api.types import CategoricalDtype
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
     is_datetime64tz_dtype)
@@ -157,6 +158,26 @@ def test_constructor_categorical(self):
         assert is_categorical_dtype(s)
         assert is_categorical_dtype(s.dtype)
 
+    def test_constructor_categorical_dtype(self):
+        result = pd.Series(['a', 'b'],
+                           dtype=CategoricalDtype(['a', 'b', 'c'],
+                                                  ordered=True))
+        assert is_categorical_dtype(result) is True
+        tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
+        assert result.cat.ordered
+
+        result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
+        assert is_categorical_dtype(result)
+        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
+        assert result.cat.ordered is False
+
+    def test_unordered_compare_equal(self):
+        left = pd.Series(['a', 'b', 'c'],
+                         dtype=CategoricalDtype(['a', 'b']))
+        right = pd.Series(pd.Categorical(['a', 'b', np.nan],
+                                         categories=['a', 'b']))
+        tm.assert_series_equal(left, right)
+
     def test_constructor_maskedarray(self):
         data = ma.masked_all((3, ), dtype=float)
         result = Series(data)
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index fa9feb016726e..3099c02e4aabd 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -12,7 +12,11 @@
 from numpy import nan
 import numpy as np
 
-from pandas import Series, Timestamp, Timedelta, DataFrame, date_range
+from pandas import (
+    Series, Timestamp, Timedelta, DataFrame, date_range,
+    Categorical, Index
+)
+from pandas.api.types import CategoricalDtype
 
 from pandas.compat import lrange, range, u
 from pandas import compat
@@ -182,6 +186,34 @@ def test_astype_dict_like(self, dtype_class):
         with pytest.raises(KeyError):
             s.astype(dt5)
 
+    def test_astype_categoricaldtype(self):
+        s = Series(['a', 'b', 'a'])
+        result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
+        expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
+        tm.assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(['a', 'b'], ordered=False))
+        expected = Series(Categorical(['a', 'b', 'a'], ordered=False))
+        tm.assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False))
+        expected = Series(Categorical(['a', 'b', 'a'],
+                                      categories=['a', 'b', 'c'],
+                                      ordered=False))
+        tm.assert_series_equal(result, expected)
+        tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c']))
+
+    def test_astype_categoricaldtype_with_args(self):
+        s = Series(['a', 'b'])
+        type_ = CategoricalDtype(['a', 'b'])
+
+        with pytest.raises(TypeError):
+            s.astype(type_, ordered=True)
+        with pytest.raises(TypeError):
+            s.astype(type_, categories=['a', 'b'])
+        with pytest.raises(TypeError):
+            s.astype(type_, categories=['a', 'b'], ordered=False)
+
     def test_astype_generic_timestamp_deprecated(self):
         # see gh-15524
         data = [1]
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index b26089ea7a822..3694bba594adb 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -760,55 +760,57 @@ def test_duplicated_with_nas(self):
         expected = np.array(trues + trues)
         tm.assert_numpy_array_equal(result, expected)
 
-    def test_numeric_object_likes(self):
-        cases = [np.array([1, 2, 1, 5, 3,
-                           2, 4, 1, 5, 6]),
-                 np.array([1.1, 2.2, 1.1, np.nan, 3.3,
-                           2.2, 4.4, 1.1, np.nan, 6.6]),
-                 np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
-                           2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
-                 np.array(['a', 'b', 'a', 'e', 'c',
-                           'b', 'd', 'a', 'e', 'f'], dtype=object),
-                 np.array([1, 2**63, 1, 3**5, 10,
-                           2**63, 39, 1, 3**5, 7], dtype=np.uint64)]
-
+    @pytest.mark.parametrize('case', [
+        np.array([1, 2, 1, 5, 3,
+                  2, 4, 1, 5, 6]),
+        np.array([1.1, 2.2, 1.1, np.nan, 3.3,
+                  2.2, 4.4, 1.1, np.nan, 6.6]),
+        pytest.mark.xfail(resaon="Complex bug. GH 16399")(
+            np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
+                     2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j])
+        ),
+        np.array(['a', 'b', 'a', 'e', 'c',
+                  'b', 'd', 'a', 'e', 'f'], dtype=object),
+        np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7],
+                 dtype=np.uint64),
+    ])
+    def test_numeric_object_likes(self, case):
         exp_first = np.array([False, False, True, False, False,
                               True, False, True, True, False])
         exp_last = np.array([True, True, True, True, False,
                              False, False, False, False, False])
         exp_false = exp_first | exp_last
 
-        for case in cases:
-            res_first = algos.duplicated(case, keep='first')
-            tm.assert_numpy_array_equal(res_first, exp_first)
+        res_first = algos.duplicated(case, keep='first')
+        tm.assert_numpy_array_equal(res_first, exp_first)
 
-            res_last = algos.duplicated(case, keep='last')
-            tm.assert_numpy_array_equal(res_last, exp_last)
+        res_last = algos.duplicated(case, keep='last')
+        tm.assert_numpy_array_equal(res_last, exp_last)
 
-            res_false = algos.duplicated(case, keep=False)
-            tm.assert_numpy_array_equal(res_false, exp_false)
+        res_false = algos.duplicated(case, keep=False)
+        tm.assert_numpy_array_equal(res_false, exp_false)
 
-            # index
-            for idx in [pd.Index(case), pd.Index(case, dtype='category')]:
-                res_first = idx.duplicated(keep='first')
-                tm.assert_numpy_array_equal(res_first, exp_first)
+        # index
+        for idx in [pd.Index(case), pd.Index(case, dtype='category')]:
+            res_first = idx.duplicated(keep='first')
+            tm.assert_numpy_array_equal(res_first, exp_first)
 
-                res_last = idx.duplicated(keep='last')
-                tm.assert_numpy_array_equal(res_last, exp_last)
+            res_last = idx.duplicated(keep='last')
+            tm.assert_numpy_array_equal(res_last, exp_last)
 
-                res_false = idx.duplicated(keep=False)
-                tm.assert_numpy_array_equal(res_false, exp_false)
+            res_false = idx.duplicated(keep=False)
+            tm.assert_numpy_array_equal(res_false, exp_false)
 
-            # series
-            for s in [Series(case), Series(case, dtype='category')]:
-                res_first = s.duplicated(keep='first')
-                tm.assert_series_equal(res_first, Series(exp_first))
+        # series
+        for s in [Series(case), Series(case, dtype='category')]:
+            res_first = s.duplicated(keep='first')
+            tm.assert_series_equal(res_first, Series(exp_first))
 
-                res_last = s.duplicated(keep='last')
-                tm.assert_series_equal(res_last, Series(exp_last))
+            res_last = s.duplicated(keep='last')
+            tm.assert_series_equal(res_last, Series(exp_last))
 
-                res_false = s.duplicated(keep=False)
-                tm.assert_series_equal(res_false, Series(exp_false))
+            res_false = s.duplicated(keep=False)
+            tm.assert_series_equal(res_false, Series(exp_false))
 
     def test_datetime_likes(self):
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index c361b430cfd8a..94f163028ba74 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -142,6 +142,26 @@ def test_constructor_empty(self):
         expected = pd.Int64Index([1, 2, 3])
         tm.assert_index_equal(c.categories, expected)
 
+    def test_constructor_tuples(self):
+        values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
+        result = Categorical(values)
+        expected = Index([(1,), (1, 2)], tupleize_cols=False)
+        tm.assert_index_equal(result.categories, expected)
+        assert result.ordered is False
+
+    def test_constructor_tuples_datetimes(self):
+        # numpy will auto reshape when all of the tuples are the
+        # same len, so add an extra one with 2 items and slice it off
+        values = np.array([(Timestamp('2010-01-01'),),
+                           (Timestamp('2010-01-02'),),
+                           (Timestamp('2010-01-01'),),
+                           (Timestamp('2010-01-02'),),
+                           ('a', 'b')], dtype=object)[:-1]
+        result = Categorical(values)
+        expected = Index([(Timestamp('2010-01-01'),),
+                          (Timestamp('2010-01-02'),)], tupleize_cols=False)
+        tm.assert_index_equal(result.categories, expected)
+
     def test_constructor_unsortable(self):
 
         # it works!
@@ -173,13 +193,13 @@ def test_is_equal_dtype(self):
         assert c1.is_dtype_equal(c1)
         assert c2.is_dtype_equal(c2)
         assert c3.is_dtype_equal(c3)
-        assert not c1.is_dtype_equal(c2)
+        assert c1.is_dtype_equal(c2)
         assert not c1.is_dtype_equal(c3)
         assert not c1.is_dtype_equal(Index(list('aabca')))
         assert not c1.is_dtype_equal(c1.astype(object))
         assert c1.is_dtype_equal(CategoricalIndex(c1))
-        assert not (c1.is_dtype_equal(
-            CategoricalIndex(c1, categories=list('cab'))))
+        assert (c1.is_dtype_equal(
+            CategoricalIndex(c1, categories=list('cab'))))  # XXX: changed
         assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
 
     def test_constructor(self):
@@ -432,6 +452,42 @@ def test_constructor_invariant(self):
             c2 = Categorical(c)
             tm.assert_categorical_equal(c, c2)
 
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_constructor_with_dtype(self, ordered):
+        categories = ['b', 'a', 'c']
+        dtype = CategoricalDtype(categories, ordered=ordered)
+        result = pd.Categorical(['a', 'b', 'a', 'c'], dtype=dtype)
+        expected = pd.Categorical(['a', 'b', 'a', 'c'], categories=categories,
+                                  ordered=ordered)
+        tm.assert_categorical_equal(result, expected)
+        assert result.ordered is ordered
+
+    def test_constructor_dtype_and_others_raises(self):
+        dtype = CategoricalDtype(['a', 'b'], ordered=True)
+        with tm.assert_raises_regex(ValueError, "Cannot"):
+            Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
+
+        with tm.assert_raises_regex(ValueError, "Cannot"):
+            Categorical(['a', 'b'], ordered=True, dtype=dtype)
+
+        with tm.assert_raises_regex(ValueError, "Cannot"):
+            Categorical(['a', 'b'], ordered=False, dtype=dtype)
+
+    @pytest.mark.parametrize('categories', [
+        None, ['a', 'b'], ['a', 'c'],
+    ])
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_constructor_str_category(self, categories, ordered):
+        result = Categorical(['a', 'b'], categories=categories,
+                             ordered=ordered, dtype='category')
+        expected = Categorical(['a', 'b'], categories=categories,
+                               ordered=ordered)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_constructor_str_unknown(self):
+        with tm.assert_raises_regex(ValueError, "Unknown `dtype`"):
+            Categorical([1, 2], dtype="foo")
+
     def test_from_codes(self):
 
         # too few categories
@@ -643,6 +699,11 @@ def test_categories_none(self):
                               'a', 'c', 'c', 'c'], ordered=True)
         tm.assert_categorical_equal(factor, self.factor)
 
+    def test_set_categories_inplace(self):
+        cat = self.factor.copy()
+        cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
+        tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd']))
+
     def test_describe(self):
         # string type
         desc = self.factor.describe()
@@ -853,6 +914,60 @@ def test_ordered_api(self):
         tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
         assert cat4.ordered
 
+    def test_set_dtype_same(self):
+        c = Categorical(['a', 'b', 'c'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
+        tm.assert_categorical_equal(result, c)
+
+    def test_set_dtype_new_categories(self):
+        c = Categorical(['a', 'b', 'c'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b', 'c', 'd']))
+        tm.assert_numpy_array_equal(result.codes, c.codes)
+        tm.assert_index_equal(result.dtype.categories,
+                              pd.Index(['a', 'b', 'c', 'd']))
+
+    def test_set_dtype_nans(self):
+        c = Categorical(['a', 'b', np.nan])
+        result = c._set_dtype(CategoricalDtype(['a', 'c']))
+        tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
+                                                           dtype='int8'))
+
+    @pytest.mark.parametrize('values, categories, new_categories', [
+        # No NaNs, same cats, same order
+        (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+        # No NaNs, same cats, different order
+        (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+        # Same, unsorted
+        (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+        # No NaNs, same cats, different order
+        (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+        # NaNs
+        (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+        (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        # Introduce NaNs
+        (['a', 'b', 'c'], ['a', 'b'], ['a']),
+        (['a', 'b', 'c'], ['a', 'b'], ['b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        # No overlap
+        (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+    ])
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_set_dtype_many(self, values, categories, new_categories,
+                            ordered):
+        c = Categorical(values, categories)
+        expected = Categorical(values, new_categories, ordered)
+        result = c._set_dtype(expected.dtype)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_set_dtype_no_overlap(self):
+        c = Categorical(['a', 'b', 'c'], ['d', 'e'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b']))
+        expected = Categorical([None, None, None], categories=['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
     def test_set_ordered(self):
 
         cat = Categorical(["a", "b", "c", "a"], ordered=True)
@@ -1526,7 +1641,7 @@ def test_shift(self):
 
     def test_nbytes(self):
         cat = pd.Categorical([1, 2, 3])
-        exp = cat._codes.nbytes + cat._categories.values.nbytes
+        exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
         assert cat.nbytes == exp
 
     def test_memory_usage(self):
@@ -1700,6 +1815,13 @@ def test_validate_inplace(self):
             with pytest.raises(ValueError):
                 cat.sort_values(inplace=value)
 
+    @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
+    def test_imaginary(self):
+        values = [1, 2, 3 + 1j]
+        c1 = pd.Categorical(values)
+        tm.assert_index_equal(c1.categories, pd.Index(values))
+        tm.assert_numpy_array_equal(np.array(c1), np.array(values))
+
 
 class TestCategoricalAsBlock(object):
 
@@ -2132,15 +2254,18 @@ def test_assignment_to_dataframe(self):
 
         result = df.dtypes
         expected = Series(
-            [np.dtype('int32'), CategoricalDtype()], index=['value', 'D'])
+            [np.dtype('int32'), CategoricalDtype(categories=labels,
+                                                 ordered=False)],
+            index=['value', 'D'])
         tm.assert_series_equal(result, expected)
 
         df['E'] = s
         str(df)
 
         result = df.dtypes
-        expected = Series([np.dtype('int32'), CategoricalDtype(),
-                           CategoricalDtype()],
+        expected = Series([np.dtype('int32'),
+                           CategoricalDtype(categories=labels, ordered=False),
+                           CategoricalDtype(categories=labels, ordered=False)],
                           index=['value', 'D', 'E'])
         tm.assert_series_equal(result, expected)
 
@@ -4050,7 +4175,7 @@ def test_categorical_index_preserver(self):
 
         # wrong catgories
         df3 = DataFrame({'A': a,
-                         'B': pd.Categorical(b, categories=list('abc'))
+                         'B': pd.Categorical(b, categories=list('abe'))  # XXX
                          }).set_index('B')
         pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
 
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 7dac83953ad8f..65f095a1406ca 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1244,7 +1244,12 @@ def assert_series_equal(left, right, check_dtype=True,
                        obj='{obj}.index'.format(obj=obj))
 
     if check_dtype:
-        assert_attr_equal('dtype', left, right)
+        if (is_categorical_dtype(left) and is_categorical_dtype(right) and
+                not check_categorical):
+            # compat with pandas 0.21.0 CategoricalDtype
+            pass
+        else:
+            assert_attr_equal('dtype', left, right)
 
     if check_exact:
         assert_numpy_array_equal(left.get_values(), right.get_values(),

From ed5c8143e4d9fb88cee9ea6eef55080ac2b74a60 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 19 Sep 2017 16:27:46 -0500
Subject: [PATCH 2/7] update

---
 pandas/core/categorical.py         | 1 +
 pandas/tests/dtypes/test_dtypes.py | 3 +++
 pandas/util/testing.py             | 4 +++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 7e92255ef0419..6ae333618c2ab 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -222,6 +222,7 @@ class Categorical(PandasObject):
     Categorical.order
     Categorical.min
     Categorical.max
+    pandas.api.types.CategoricalDtype
     """
 
     # For comparisons, so that numpy uses our implementation if the compare
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index ce06f013dc85d..0314723745e05 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -615,6 +615,9 @@ def test_invalid_raises(self):
         with tm.assert_raises_regex(TypeError, 'ordered'):
             CategoricalDtype(['a', 'b'], ordered='foo')
 
+        with tm.assert_raises_regex(TypeError, 'collection'):
+            CategoricalDtype('category')
+
     def test_mixed(self):
         a = CategoricalDtype(['a', 'b', 1, 2])
         b = CategoricalDtype(['a', 'b', '1', '2'])
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 65f095a1406ca..23dba96e74a2e 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1244,9 +1244,11 @@ def assert_series_equal(left, right, check_dtype=True,
                        obj='{obj}.index'.format(obj=obj))
 
     if check_dtype:
+        # We want to skip exact dtype checking when `check_categorical`
+        # is False. We'll still raise if only one is a `Categorical`,
+        # regardless of `check_categorical`
         if (is_categorical_dtype(left) and is_categorical_dtype(right) and
                 not check_categorical):
-            # compat with pandas 0.21.0 CategoricalDtype
             pass
         else:
             assert_attr_equal('dtype', left, right)

From 416d1d7b9ce81287a1b7f2802751dca1fdeb5084 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 20 Sep 2017 10:47:09 -0500
Subject: [PATCH 3/7] Consistent CategoricalDtype use in Categorical init

Get a valid instance of `CategoricalDtype` as early as possible, and use that
throughout.
---
 pandas/core/categorical.py         | 46 ++++++++++++++++++++----------
 pandas/core/dtypes/dtypes.py       | 13 +++++++++
 pandas/tests/dtypes/test_dtypes.py | 27 ++++++++++++++++++
 pandas/tests/test_categorical.py   | 41 ++++++++++++++++++++++++++
 4 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 6ae333618c2ab..f5c2d0306a42c 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -234,6 +234,21 @@ class Categorical(PandasObject):
     def __init__(self, values, categories=None, ordered=None, dtype=None,
                  fastpath=False):
 
+        # Ways of specifying the dtype (prioritized ordered)
+        # 1. dtype is a CategoricalDtype
+        #    a.) with known categories, use dtype.categories
+        #    b.) else with Categorical values, use values.dtype
+        #    c.) else, infer from values
+        #    d.) specifying dtype=CategoricalDtype and categories is an error
+        # 2. dtype is a string 'category'
+        #    a.) use categories, ordered
+        #    b.) use values.dtype
+        #    c.) infer from values
+        # 3. dtype is None
+        #    a.) use categories, ordered
+        #    b.) use values.dtype
+        #    c.) infer from values
+
         if dtype is not None:
             if isinstance(dtype, compat.string_types):
                 if dtype == 'category':
@@ -247,12 +262,16 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
             categories = dtype.categories
             ordered = dtype.ordered
 
-        if ordered is None:
-            ordered = False
+        elif is_categorical(values):
+            dtype = values.dtype._from_categorical_dtype(values.dtype,
+                                                         categories, ordered)
+        else:
+            dtype = CategoricalDtype(categories, ordered)
+
+        # At this point, dtype is always a CategoricalDtype
+        # if dtype.categories is None, we are inferring
 
         if fastpath:
-            if dtype is None:
-                dtype = CategoricalDtype(categories, ordered)
             self._codes = coerce_indexer_dtype(values, categories)
             self._dtype = dtype
             return
@@ -260,7 +279,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
         # sanitize input
         if is_categorical_dtype(values):
 
-            # we are either a Series, CategoricalIndex
+            # we are either a Series or a CategoricalIndex
             if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
                 values = values._values
 
@@ -271,6 +290,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
             values = values.get_values()
 
         elif isinstance(values, (ABCIndexClass, ABCSeries)):
+            # we'll do inference later
             pass
 
         else:
@@ -288,12 +308,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
                 # "object" dtype to prevent this. In the end objects will be
                 # casted to int/... in the category assignment step.
                 if len(values) == 0 or isna(values).any():
-                    dtype = 'object'
+                    sanitize_dtype = 'object'
                 else:
-                    dtype = None
-                values = _sanitize_array(values, None, dtype=dtype)
+                    sanitize_dtype = None
+                values = _sanitize_array(values, None, dtype=sanitize_dtype)
 
-        if categories is None:
+        if dtype.categories is None:
             try:
                 codes, categories = factorize(values, sort=True)
             except TypeError:
@@ -310,7 +330,8 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
                 raise NotImplementedError("> 1 ndim Categorical are not "
                                           "supported at this time")
 
-            if dtype is None or isinstance(dtype, str):
+            if dtype.categories is None:
+                # we're inferring from values
                 dtype = CategoricalDtype(categories, ordered)
 
         else:
@@ -321,11 +342,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
             # - the new one, where each value is also in the categories array
             #   (or np.nan)
 
-            # make sure that we always have the same type here, no matter what
-            # we get passed in
-            if dtype is None or isinstance(dtype, str):
-                dtype = CategoricalDtype(categories, ordered)
-
             codes = _get_codes_for_values(values, dtype.categories)
 
             # TODO: check for old style usage. These warnings should be removes
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 8be7870be67f2..d2487905caced 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -160,9 +160,22 @@ def _from_fastpath(cls, categories=None, ordered=False):
         self._finalize(categories, ordered, fastpath=True)
         return self
 
+    @classmethod
+    def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
+        if categories is ordered is None:
+            return dtype
+        if categories is None:
+            categories = dtype.categories
+        if ordered is None:
+            ordered = dtype.ordered
+        return cls(categories, ordered)
+
     def _finalize(self, categories, ordered, fastpath=False):
         from pandas.core.indexes.base import Index
 
+        if ordered is None:
+            ordered = False
+
         if categories is not None:
             categories = Index(categories, tupleize_cols=False)
             # validation
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index 0314723745e05..be3e5fdc467d3 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -622,3 +622,30 @@ def test_mixed(self):
         a = CategoricalDtype(['a', 'b', 1, 2])
         b = CategoricalDtype(['a', 'b', '1', '2'])
         assert hash(a) != hash(b)
+
+    def test_from_categorical_dtype_identity(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # Identity test for no changes
+        c2 = CategoricalDtype._from_categorical_dtype(c1)
+        assert c2 is c1
+
+    def test_from_categorical_dtype_categories(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # override categories
+        result = CategoricalDtype._from_categorical_dtype(
+            c1, categories=[2, 3])
+        assert result == CategoricalDtype([2, 3], ordered=True)
+
+    def test_from_categorical_dtype_ordered(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # override ordered
+        result = CategoricalDtype._from_categorical_dtype(
+            c1, ordered=False)
+        assert result == CategoricalDtype([1, 2, 3], ordered=False)
+
+    def test_from_categorical_dtype_both(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # override ordered
+        result = CategoricalDtype._from_categorical_dtype(
+            c1, categories=[1, 2], ordered=False)
+        assert result == CategoricalDtype([1, 2], ordered=False)
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 94f163028ba74..afd9e2dc2df2f 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -488,6 +488,37 @@ def test_constructor_str_unknown(self):
         with tm.assert_raises_regex(ValueError, "Unknown `dtype`"):
             Categorical([1, 2], dtype="foo")
 
+    def test_constructor_from_categorical_with_dtype(self):
+        dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True)
+        values = Categorical(['a', 'b', 'd'])
+        result = Categorical(values, dtype=dtype)
+        # We use dtype.categories, not values.categories
+        expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
+                               ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_constructor_from_categorical_with_unknown_dtype(self):
+        dtype = CategoricalDtype(None, ordered=True)
+        values = Categorical(['a', 'b', 'd'])
+        result = Categorical(values, dtype=dtype)
+        # We use values.categories, not dtype.categories
+        expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'],
+                               ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_contructor_from_categorical_string(self):
+        values = Categorical(['a', 'b', 'd'])
+        # use categories, ordered
+        result = Categorical(values, categories=['a', 'b', 'c'], ordered=True,
+                             dtype='category')
+        expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'],
+                               ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+        # No string
+        result = Categorical(values, categories=['a', 'b', 'c'], ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
     def test_from_codes(self):
 
         # too few categories
@@ -932,6 +963,16 @@ def test_set_dtype_nans(self):
         tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
                                                            dtype='int8'))
 
+    def test_set_categories(self):
+        cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
+        result = cat._set_categories(['a', 'b', 'c', 'd', 'e'])
+        expected = Categorical(['a', 'b', 'c'], categories=list('abcde'))
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath
+        result = cat._set_categories(['a', 'b', 'c', 'd', 'e'], fastpath=True)
+        tm.assert_categorical_equal(result, expected)
+
     @pytest.mark.parametrize('values, categories, new_categories', [
         # No NaNs, same cats, same order
         (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),

From e6c05a005181042153c746fe0f62ee549529b631 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 22 Sep 2017 17:30:18 -0500
Subject: [PATCH 4/7] PEP8 fixes

---
 pandas/tests/io/test_parquet.py  | 2 --
 pandas/tests/test_categorical.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 4382958474c9f..af382a05fee45 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -7,8 +7,6 @@
 
 import numpy as np
 import pandas as pd
-from pandas.compat import PY3
-from distutils.version import LooseVersion
 from pandas.compat import PY3, is_platform_windows
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
                                PyArrowImpl, FastParquetImpl)
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index afd9e2dc2df2f..d3a62cae685ed 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -963,7 +963,7 @@ def test_set_dtype_nans(self):
         tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
                                                            dtype='int8'))
 
-    def test_set_categories(self):
+    def test_set_categories_private(self):
         cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
         result = cat._set_categories(['a', 'b', 'c', 'd', 'e'])
         expected = Categorical(['a', 'b', 'c'], categories=list('abcde'))

From 41172ce0969a9306f225bc09142db07a0097037e Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 23 Sep 2017 06:08:35 -0500
Subject: [PATCH 5/7] Final doc fixups

---
 doc/source/categorical.rst       | 6 ++++--
 doc/source/whatsnew/v0.21.0.txt  | 3 ++-
 pandas/tests/test_categorical.py | 4 ++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 55b5d93e94943..d2b53420764b2 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -207,8 +207,10 @@ All instances of ``CategoricalDtype`` compare equal to the string ``'category'``
 .. warning::
 
    Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``,
-   and since all instances ``CategoricalDtype`` compare equal to ``'`category'``,
-   all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``
+   and since all instances ``CategoricalDtype`` compare equal to ``'category'``,
+   all instances of ``CategoricalDtype`` compare equal to a
+   ``CategoricalDtype(None, False)``, regardless of ``categories`` or
+   ``ordered``.
 
 Description
 -----------
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index e73572c296eac..82b793f7f84c3 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -100,7 +100,8 @@ Setting a list-like data structure into a new attribute now raise a ``UserWarnin
 expanded to include the ``categories`` and ``ordered`` attributes. A
 ``CategoricalDtype`` can be used to specify the set of categories and
 orderedness of an array, independent of the data themselves. This can be useful,
-e.g., when converting string data to a ``Categorical`` (:issue:`14711`, :issue:`15078`):
+e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
+:issue:`15078`, :issue:`16015`):
 
 .. ipython:: python
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index d3a62cae685ed..9b124ba1f276d 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -199,7 +199,7 @@ def test_is_equal_dtype(self):
         assert not c1.is_dtype_equal(c1.astype(object))
         assert c1.is_dtype_equal(CategoricalIndex(c1))
         assert (c1.is_dtype_equal(
-            CategoricalIndex(c1, categories=list('cab'))))  # XXX: changed
+            CategoricalIndex(c1, categories=list('cab'))))
         assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
 
     def test_constructor(self):
@@ -4216,7 +4216,7 @@ def test_categorical_index_preserver(self):
 
         # wrong catgories
         df3 = DataFrame({'A': a,
-                         'B': pd.Categorical(b, categories=list('abe'))  # XXX
+                         'B': pd.Categorical(b, categories=list('abe'))
                          }).set_index('B')
         pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
 

From 141e5094e13fecd68f1fc124e06e0349c29adc2a Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 23 Sep 2017 07:56:56 -0500
Subject: [PATCH 6/7] Fixup set_categories inplace test

---
 pandas/core/categorical.py       | 13 ++++++++++++-
 pandas/tests/test_categorical.py | 12 +++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index f5c2d0306a42c..48180f5e3217a 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -601,13 +601,24 @@ def _get_labels(self):
     labels = property(fget=_get_labels, fset=_set_codes)
 
     def _set_categories(self, categories, fastpath=False):
-        """ Sets new categories
+        """ Sets new categories inplace
 
         Parameters
         ----------
         fastpath : boolean (default: False)
            Don't perform validation of the categories for uniqueness or nulls
 
+        Examples
+        --------
+        >>> c = Categorical(['a', 'b'])
+        >>> c
+        [a, b]
+        Categories (2, object): [a, b]
+
+        >>> c._set_categories(pd.Index(['a', 'c']))
+        >>> c
+        [a, c]
+        Categories (2, object): [a, c]
         """
 
         if fastpath:
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 9b124ba1f276d..71f43d1922085 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -965,13 +965,15 @@ def test_set_dtype_nans(self):
 
     def test_set_categories_private(self):
         cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
-        result = cat._set_categories(['a', 'b', 'c', 'd', 'e'])
-        expected = Categorical(['a', 'b', 'c'], categories=list('abcde'))
-        tm.assert_categorical_equal(result, expected)
+        cat._set_categories(['a', 'c', 'd', 'e'])
+        expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
+        tm.assert_categorical_equal(cat, expected)
 
         # fastpath
-        result = cat._set_categories(['a', 'b', 'c', 'd', 'e'], fastpath=True)
-        tm.assert_categorical_equal(result, expected)
+        cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
+        cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True)
+        expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
+        tm.assert_categorical_equal(cat, expected)
 
     @pytest.mark.parametrize('values, categories, new_categories', [
         # No NaNs, same cats, same order

From 43f90cc13786b57b89709cdb7dd8d2c023adaee6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sat, 23 Sep 2017 11:33:03 -0500
Subject: [PATCH 7/7] PEP8

---
 pandas/tests/io/test_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index af382a05fee45..ecd4e8f719014 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 import pandas as pd
-from pandas.compat import PY3, is_platform_windows
+from pandas.compat import PY3
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
                                PyArrowImpl, FastParquetImpl)
 from pandas.util import testing as tm