From e83a0b820e1abd2c97685c6ff5a1917f35bd5722 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 31 Aug 2017 13:53:07 -0500
Subject: [PATCH 01/17] ENH: Accept CategoricalDtype in CSV reader

---
 doc/source/io.rst                | 15 ++++++++++-
 pandas/_libs/parsers.pyx         | 23 +++++++++++++---
 pandas/io/parsers.py             |  7 ++++-
 pandas/tests/io/parser/dtypes.py | 45 ++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index d6abed6e9d1ad..6a4af0c716f4d 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -452,7 +452,8 @@ Specifying Categorical dtype
 
 .. versionadded:: 0.19.0
 
-``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
+``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
+``dtype=CategoricalDtype(categories, ordered)``.
 
 .. ipython:: python
 
@@ -468,6 +469,18 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
+Specifying ``dtype='cateogry'`` will result in a ``Categorical`` that is
+unordered, and whose ``categories`` are the unique values observed in the data.
+For more control on the categories and order, create a
+:class:`~pandas.api.types.CategoricalDtype` ahead of time.
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
+   pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes
+
 .. note::
 
    The resulting categories will always be parsed as strings (object dtype).
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 5bf9f4ce83cbf..9324ca0c76ce3 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1267,6 +1267,8 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
         elif is_categorical_dtype(dtype):
+            # TODO: I suspect that this could be optimized when dtype
+            # is an instance of CategoricalDtype
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
@@ -1278,8 +1280,18 @@ cdef class TextReader:
                 indexer = cats.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
 
-            return Categorical(codes, categories=cats, ordered=False,
-                               fastpath=True), na_count
+            cat = Categorical(codes, categories=cats, ordered=False,
+                              fastpath=True)
+
+            if isinstance(dtype, CategoricalDtype):
+                if dtype.categories is None:
+                    # skip recoding
+                    if dtype.ordered:
+                        cat = cat.set_ordered(ordered=dtype.ordered)
+                else:
+                    cat = cat.set_categories(dtype.categories,
+                                             ordered=dtype.ordered)
+            return cat, na_count
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
@@ -2230,8 +2242,11 @@ def _concatenate_chunks(list chunks):
             if common_type == np.object:
                 warning_columns.append(str(name))
 
-        if is_categorical_dtype(dtypes.pop()):
-            result[name] = union_categoricals(arrs, sort_categories=True)
+        dtype = dtypes.pop()
+        if is_categorical_dtype(dtype):
+            sort_categories = isinstance(dtype, str)
+            result[name] = union_categoricals(arrs,
+                                              sort_categories=sort_categories)
         else:
             result[name] = np.concatenate(arrs)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index ed15d4295d688..2a94c50c91f25 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -21,6 +21,7 @@
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
     is_scalar, is_categorical_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
@@ -1607,7 +1608,11 @@ def _cast_types(self, values, cast_type, column):
             # as strings
             if not is_object_dtype(values):
                 values = astype_nansafe(values, str)
-            values = Categorical(values)
+            if isinstance(cast_type, CategoricalDtype):
+                values = Categorical(values, categories=cast_type.categories,
+                                     ordered=cast_type.ordered)
+            else:
+                values = Categorical(values)
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 402fa0817595c..10d005fa4a333 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -149,6 +149,51 @@ def test_categorical_dtype_chunksize(self):
         for actual, expected in zip(actuals, expecteds):
             tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.parametrize('ordered', [False, True])
+    @pytest.mark.parametrize('categories', [
+        ['a', 'b', 'c'],
+        ['a', 'c', 'b'],
+        ['a', 'b', 'c', 'd'],
+    ])
+    def test_categorical_categoricaldtype(self, categories, ordered):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expected = pd.DataFrame({
+            "a": [1, 1, 1, 2],
+            "b": Categorical(['a', 'b', 'b', 'c'],
+                             categories=categories,
+                             ordered=ordered)
+        })
+        dtype = {"b": CategoricalDtype(categories=categories,
+                                       ordered=ordered)}
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_categoricaldtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        cats = ['a', 'b', 'c']
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'],
+                                                    categories=cats)}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'],
+                                                    categories=cats)},
+                                  index=[2, 3])]
+        dtype = CategoricalDtype(cats)
+        actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_empty_pass_dtype(self):
         data = 'one,two'
         result = self.read_csv(StringIO(data), dtype={'one': 'u1'})

From 388e8a963dc8d0e93a8d910bf17ce565048503df Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 24 Sep 2017 13:26:23 -0500
Subject: [PATCH 02/17] rework

---
 pandas/_libs/parsers.pyx | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 9324ca0c76ce3..90bbe07b23d55 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1274,23 +1274,25 @@ cdef class TextReader:
                 na_hashset, self.c_encoding)
             # sort categories and recode if necessary
             cats = Index(cats)
-            if not cats.is_monotonic_increasing:
+            if (isinstance(dtype, CategoricalDtype) and
+                    dtype.categories is not None):
+                # redcode for dtype.categories
+                categories = dtype.categories
+                indexer = cats.get_indexer(categories)
+                codes = take_1d(codes, categories, fill_value=-1)
+            elif not cats.is_monotonic_increasing:
                 unsorted = cats.copy()
                 cats = cats.sort_values()
                 indexer = cats.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
+            else:
+                categories = cats
 
-            cat = Categorical(codes, categories=cats, ordered=False,
+            cat = Categorical(codes, categories=categories, ordered=False,
                               fastpath=True)
 
-            if isinstance(dtype, CategoricalDtype):
-                if dtype.categories is None:
-                    # skip recoding
-                    if dtype.ordered:
-                        cat = cat.set_ordered(ordered=dtype.ordered)
-                else:
-                    cat = cat.set_categories(dtype.categories,
-                                             ordered=dtype.ordered)
+            if isinstance(dtype, CategoricalDtype) and dtype.ordered:
+                cat = cat.set_ordered(ordered=True)
             return cat, na_count
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,

From c5f6e04fdf21de4beede3bf2fb6a12cabc4cd76d Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 24 Sep 2017 15:39:20 -0500
Subject: [PATCH 03/17] Fixed basic implementation

---
 pandas/_libs/parsers.pyx         | 15 ++++++++-------
 pandas/tests/io/parser/dtypes.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 90bbe07b23d55..b2a7d15a3692e 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1272,27 +1272,28 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
-            # sort categories and recode if necessary
             cats = Index(cats)
             if (isinstance(dtype, CategoricalDtype) and
                     dtype.categories is not None):
-                # redcode for dtype.categories
+                # recode for dtype.categories
                 categories = dtype.categories
-                indexer = cats.get_indexer(categories)
-                codes = take_1d(codes, categories, fill_value=-1)
+                indexer = categories.get_indexer(cats)
+                codes = take_1d(indexer, codes, fill_value=-1)
+                ordered = dtype.ordered
             elif not cats.is_monotonic_increasing:
+                # sort categories and recode if necessary
                 unsorted = cats.copy()
                 cats = cats.sort_values()
                 indexer = cats.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
+                ordered = False
             else:
                 categories = cats
+                ordered = False
 
-            cat = Categorical(codes, categories=categories, ordered=False,
+            cat = Categorical(codes, categories=categories, ordered=ordered,
                               fastpath=True)
 
-            if isinstance(dtype, CategoricalDtype) and dtype.ordered:
-                cat = cat.set_ordered(ordered=True)
             return cat, na_count
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 10d005fa4a333..5f6cf2892a254 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -154,6 +154,7 @@ def test_categorical_dtype_chunksize(self):
         ['a', 'b', 'c'],
         ['a', 'c', 'b'],
         ['a', 'b', 'c', 'd'],
+        ['c', 'b', 'a'],
     ])
     def test_categorical_categoricaldtype(self, categories, ordered):
         data = """a,b
@@ -172,6 +173,35 @@ def test_categorical_categoricaldtype(self, categories, ordered):
         result = self.read_csv(StringIO(data), dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
+    def test_categorical_categoricaldtype_unsorted(self):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        dtype = CategoricalDtype(['c', 'b', 'a'])
+        expected = pd.DataFrame({
+            'a': [1, 1, 1, 2],
+            'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a'])
+        })
+        result = self.read_csv(StringIO(data), dtype={'b': dtype})
+        tm.assert_frame_equal(result, expected)
+
+#     @pytest.mark.parametrize('ordered', [True, False])
+#     def test_categoricaldtype_coerces(self, ordered):
+#         dtype = {'b': CategoricalDtype([10, 11, 12, 13], ordered=ordered)}
+#         data = """a,b
+# 1,10
+# 1,11
+# 1,12
+# 2,13"""
+#         expected = pd.DataFrame({
+#             'a': [1, 1, 1, 2],
+#             'b': Categorical([10, 11, 12, 13], ordered=ordered),
+#         }, columns=['a', 'b'])
+#         result = self.read_csv(StringIO(data), dtype=dtype)
+#         tm.assert_frame_equal(result, expected)
+
     def test_categorical_categoricaldtype_chunksize(self):
         # GH 10153
         data = """a,b

From 4b588cdda8b7fed3d7df084078738a91c9e04be5 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 25 Sep 2017 13:22:05 -0500
Subject: [PATCH 04/17] Added casting

---
 pandas/_libs/parsers.pyx         | 24 ++++++++++++++++++---
 pandas/io/parsers.py             | 21 ++++++++++++++-----
 pandas/tests/io/parser/dtypes.py | 36 +++++++++++++++++++-------------
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index b2a7d15a3692e..c76f77e9d047b 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -48,7 +48,7 @@ from pandas.core.dtypes.common import (
 from pandas.core.categorical import Categorical
 from pandas.core.algorithms import take_1d
 from pandas.core.dtypes.concat import union_categoricals
-from pandas import Index
+from pandas import Index, to_numeric, to_datetime, to_timedelta
 
 import pandas.io.common as com
 
@@ -1267,12 +1267,30 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
         elif is_categorical_dtype(dtype):
-            # TODO: I suspect that this could be optimized when dtype
-            # is an instance of CategoricalDtype
+            # TODO: I suspect that _categorical_convert could be
+            # optimized when dtype is an instance of CategoricalDtype
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
             cats = Index(cats)
+
+            # Here is where we'll do the casting...
+            if (isinstance(dtype, CategoricalDtype) and
+                    dtype.categories is not None):
+                if dtype.categories.is_numeric():
+                    # is ignore correct?
+                    cats = to_numeric(cats, errors='ignore')
+                elif dtype.categories.is_all_dates:
+                    # is ignore correct?
+                    if is_datetime64_dtype(dtype.categories):
+                        print("before", cats)
+                        cats = to_datetime(cats, errors='ignore')
+                        print("after", cats)
+                    else:
+                        print("before", cats)
+                        cats = to_timedelta(cats, errors='ignore')
+                        print("after", cats)
+
             if (isinstance(dtype, CategoricalDtype) and
                     dtype.categories is not None):
                 # recode for dtype.categories
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 2a94c50c91f25..286e3e2e72d94 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -12,7 +12,7 @@
 
 import numpy as np
 
-from pandas import compat
+from pandas import compat, to_numeric, to_timedelta
 from pandas.compat import (range, lrange, PY3, StringIO, lzip,
                            zip, string_types, map, u)
 from pandas.core.dtypes.common import (
@@ -20,7 +20,8 @@
     is_list_like, is_integer_dtype,
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
-    is_scalar, is_categorical_dtype)
+    is_scalar, is_categorical_dtype,
+    is_datetime64_dtype, is_timedelta64_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import astype_nansafe
@@ -1606,11 +1607,21 @@ def _cast_types(self, values, cast_type, column):
             # XXX this is for consistency with
             # c-parser which parses all categories
             # as strings
-            if not is_object_dtype(values):
-                values = astype_nansafe(values, str)
-            if isinstance(cast_type, CategoricalDtype):
+            known_cats = (isinstance(cast_type, CategoricalDtype) and
+                          cast_type.categories is not None)
+            str_values = is_object_dtype(values)
+
+            if known_cats and str_values:
+                if cast_type.categories.is_numeric():
+                    values = to_numeric(values, errors='ignore')
+                elif is_datetime64_dtype(cast_type.categories):
+                    values = tools.to_datetime(values, errors='ignore')
+                elif is_timedelta64_dtype(cast_type.categories):
+                    values = to_timedelta(values, errors='ignore')
                 values = Categorical(values, categories=cast_type.categories,
                                      ordered=cast_type.ordered)
+            elif not is_object_dtype(values):
+                values = astype_nansafe(values, str)
             else:
                 values = Categorical(values)
         else:
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 5f6cf2892a254..6b92658f174d1 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -187,20 +187,28 @@ def test_categorical_categoricaldtype_unsorted(self):
         result = self.read_csv(StringIO(data), dtype={'b': dtype})
         tm.assert_frame_equal(result, expected)
 
-#     @pytest.mark.parametrize('ordered', [True, False])
-#     def test_categoricaldtype_coerces(self, ordered):
-#         dtype = {'b': CategoricalDtype([10, 11, 12, 13], ordered=ordered)}
-#         data = """a,b
-# 1,10
-# 1,11
-# 1,12
-# 2,13"""
-#         expected = pd.DataFrame({
-#             'a': [1, 1, 1, 2],
-#             'b': Categorical([10, 11, 12, 13], ordered=ordered),
-#         }, columns=['a', 'b'])
-#         result = self.read_csv(StringIO(data), dtype=dtype)
-#         tm.assert_frame_equal(result, expected)
+    def test_categoricaldtype_coerces_numeric(self):
+        dtype = {'b': CategoricalDtype([1, 2, 3])}
+        data = "b\n1\n1\n2\n3"
+        expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_datetime(self):
+        dtype = {
+            'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
+        }
+        data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
+        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_timedelta(self):
+        dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
+        data = "b\n1H\n2H\n3H"
+        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
 
     def test_categorical_categoricaldtype_chunksize(self):
         # GH 10153

From e32d5be89781b6cc4923419a87064778cddd9343 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 25 Sep 2017 13:32:59 -0500
Subject: [PATCH 05/17] Doc and cleanup

---
 doc/source/io.rst               | 20 ++++++++++++--------
 doc/source/whatsnew/v0.21.0.txt |  2 ++
 pandas/_libs/parsers.pyx        |  7 ++-----
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 6a4af0c716f4d..61659a9571140 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -469,10 +469,11 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
-Specifying ``dtype='cateogry'`` will result in a ``Categorical`` that is
-unordered, and whose ``categories`` are the unique values observed in the data.
-For more control on the categories and order, create a
-:class:`~pandas.api.types.CategoricalDtype` ahead of time.
+Specifying ``dtype='category'`` will result in an unordered ``Categorical``
+whose ``categories`` are the unique values observed in the data. For more
+control on the categories and order, create a
+:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for
+that column's ``dtype``.
 
 .. ipython:: python
 
@@ -483,10 +484,13 @@ For more control on the categories and order, create a
 
 .. note::
 
-   The resulting categories will always be parsed as strings (object dtype).
-   If the categories are numeric they can be converted using the
-   :func:`to_numeric` function, or as appropriate, another converter
-   such as :func:`to_datetime`.
+   With ``dtype='category'``, the resulting categories will always be parsed
+   as strings (object dtype). If the categories are numeric they can be
+   converted using the :func:`to_numeric` function, or as appropriate, another
+   converter such as :func:`to_datetime`.
+
+   When ``dtype`` is a ``CategoricalDtype`` with homogenous ``categoriess`` (
+   all numeric, all datetimes, etc.), the conversion is done automatically.
 
    .. ipython:: python
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 1365901c2ce5e..ab68219a18d8a 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -163,6 +163,8 @@ Other Enhancements
 - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`)
 - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`)
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
+- Pass a :class:`~pandas.api.types.CategoricalDtype` to :meth:`read_csv` to parse categorical
+  data as numeric, datetimes, or timedeltas, instead of strings. See :ref:`here <io.categorical>`. (:issue:`17643`)
 
 
 .. _whatsnew_0210.api_breaking:
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index c76f77e9d047b..4f14fc365aa77 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1274,7 +1274,8 @@ cdef class TextReader:
                 na_hashset, self.c_encoding)
             cats = Index(cats)
 
-            # Here is where we'll do the casting...
+            # Determine if we should convert inferred string
+            # categories to a specialized type
             if (isinstance(dtype, CategoricalDtype) and
                     dtype.categories is not None):
                 if dtype.categories.is_numeric():
@@ -1283,13 +1284,9 @@ cdef class TextReader:
                 elif dtype.categories.is_all_dates:
                     # is ignore correct?
                     if is_datetime64_dtype(dtype.categories):
-                        print("before", cats)
                         cats = to_datetime(cats, errors='ignore')
-                        print("after", cats)
                     else:
-                        print("before", cats)
                         cats = to_timedelta(cats, errors='ignore')
-                        print("after", cats)
 
             if (isinstance(dtype, CategoricalDtype) and
                     dtype.categories is not None):

From 508dd1e13f67e06e255d91f708b56a2891fa2184 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 25 Sep 2017 14:42:18 -0500
Subject: [PATCH 06/17] Fixed assignment of categoricals

---
 pandas/_libs/parsers.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 4f14fc365aa77..0c8ca84b6ad7f 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1298,8 +1298,8 @@ cdef class TextReader:
             elif not cats.is_monotonic_increasing:
                 # sort categories and recode if necessary
                 unsorted = cats.copy()
-                cats = cats.sort_values()
-                indexer = cats.get_indexer(unsorted)
+                categories = cats.sort_values()
+                indexer = categories.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
                 ordered = False
             else:

From 6f175a7f727d44cf819252d8979a38c1b19384b7 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 26 Sep 2017 08:51:03 -0500
Subject: [PATCH 07/17] Doc and test unexpected values

---
 doc/source/io.rst                | 10 +++++++++-
 doc/source/whatsnew/v0.21.0.txt  | 29 ++++++++++++++++++++++++++---
 pandas/tests/io/parser/dtypes.py |  8 ++++++++
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 61659a9571140..233f32123da5b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -482,6 +482,14 @@ that column's ``dtype``.
    dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
    pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes
 
+When using ``dtype=CategoricalDtype``, "unexpected" values outside of
+``dtype.categories`` are treated as missing values.
+
+   dtype = CategoricalDtype(['a', 'b', 'd'])  # No 'c'
+   pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1
+
+This matches the behavior of :meth:`Categorical.set_categories`.
+
 .. note::
 
    With ``dtype='category'``, the resulting categories will always be parsed
@@ -489,7 +497,7 @@ that column's ``dtype``.
    converted using the :func:`to_numeric` function, or as appropriate, another
    converter such as :func:`to_datetime`.
 
-   When ``dtype`` is a ``CategoricalDtype`` with homogenous ``categoriess`` (
+   When ``dtype`` is a ``CategoricalDtype`` with homogeneous ``categories`` (
    all numeric, all datetimes, etc.), the conversion is done automatically.
 
    .. ipython:: python
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index ab68219a18d8a..414e8b45ab001 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -119,7 +119,7 @@ expanded to include the ``categories`` and ``ordered`` attributes. A
 ``CategoricalDtype`` can be used to specify the set of categories and
 orderedness of an array, independent of the data themselves. This can be useful,
 e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
-:issue:`15078`, :issue:`16015`):
+:issue:`15078`, :issue:`16015`, :issue:`17643`):
 
 .. ipython:: python
 
@@ -129,8 +129,33 @@ e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
    dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True)
    s.astype(dtype)
 
+One place that deserves special mention is in :meth:`read_csv`. Previously, with
+``dtype={'col': 'category'}``, the returned values and categories would always
+be strings.
+
+.. ipython:: python
+
+   from pandas.compat import StringIO
+
+   data = 'A,B\na,1\nb,2\nc,3'
+   pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories
+
+Notice the "object" dtype.
+
+With a ``CategoricalDtype`` of all numerics, datetimes, or
+timedeltas, we can automatically convert to the correct type
+
+    dtype = {'B': CategoricalDtype([1, 2, 3])}
+    pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories
+
+The values have been correctly interpreted as integers.
+
 The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
 ``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
+For the most part, this is backwards compatible, though the string repr has changed.
+If you were previously using ``str(s.dtype == 'category')`` to detect categorical data,
+switch to :func:`api.types.is_categorical_dtype`, which is compatible with the old and
+new ``CategoricalDtype``.
 
 See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
 
@@ -163,8 +188,6 @@ Other Enhancements
 - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`)
 - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`)
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
-- Pass a :class:`~pandas.api.types.CategoricalDtype` to :meth:`read_csv` to parse categorical
-  data as numeric, datetimes, or timedeltas, instead of strings. See :ref:`here <io.categorical>`. (:issue:`17643`)
 
 
 .. _whatsnew_0210.api_breaking:
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 6b92658f174d1..096fcf9a52718 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -210,6 +210,14 @@ def test_categoricaldtype_coerces_timedelta(self):
         result = self.read_csv(StringIO(data), dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
+    def test_categoricaldtype_unexpected_categories(self):
+        dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])}
+        data = "b\nd\na\nc\nd"  # Unexpected c
+        expected = pd.DataFrame({"b": Categorical(list('dacd'),
+                                                  dtype=dtype['b'])})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
     def test_categorical_categoricaldtype_chunksize(self):
         # GH 10153
         data = """a,b

From 1545734ed1b258b7ef08e6788e460bf39a37f650 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 26 Sep 2017 10:25:56 -0500
Subject: [PATCH 08/17] DOC: fixups

---
 doc/source/io.rst               |  2 ++
 doc/source/whatsnew/v0.21.0.txt | 14 +++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 233f32123da5b..58e854d16afa2 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -485,6 +485,8 @@ that column's ``dtype``.
 When using ``dtype=CategoricalDtype``, "unexpected" values outside of
 ``dtype.categories`` are treated as missing values.
 
+.. ipython:: python
+
    dtype = CategoricalDtype(['a', 'b', 'd'])  # No 'c'
    pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 414e8b45ab001..731645f992e6a 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -134,9 +134,12 @@ One place that deserves special mention is in :meth:`read_csv`. Previously, with
 be strings.
 
 .. ipython:: python
+   :suppress:
 
    from pandas.compat import StringIO
 
+.. ipython:: python
+
    data = 'A,B\na,1\nb,2\nc,3'
    pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories
 
@@ -151,11 +154,12 @@ timedeltas, we can automatically convert to the correct type
 The values have been correctly interpreted as integers.
 
 The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
-``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
-For the most part, this is backwards compatible, though the string repr has changed.
-If you were previously using ``str(s.dtype == 'category')`` to detect categorical data,
-switch to :func:`api.types.is_categorical_dtype`, which is compatible with the old and
-new ``CategoricalDtype``.
+``Series`` with categorical type will now return an instance of
+``CategoricalDtype``. For the most part, this is backwards compatible, though
+the string repr has changed. If you were previously using ``str(s.dtype) ==
+'category'`` to detect categorical data, switch to
+:func:`pandas.api.types.is_categorical_dtype`, which is compatible with the old
+and new ``CategoricalDtype``.
 
 See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
 

From b80cff8c2f83559298f723bd092255538b753ba9 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 26 Sep 2017 14:17:45 -0500
Subject: [PATCH 09/17] More coercion, use _recode_for_categories

---
 pandas/_libs/parsers.pyx         | 5 ++---
 pandas/tests/io/parser/dtypes.py | 8 ++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 0c8ca84b6ad7f..9db225a6287db 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -45,7 +45,7 @@ from pandas.core.dtypes.common import (
     is_bool_dtype, is_object_dtype,
     is_string_dtype, is_datetime64_dtype,
     pandas_dtype)
-from pandas.core.categorical import Categorical
+from pandas.core.categorical import Categorical, _recode_for_categories
 from pandas.core.algorithms import take_1d
 from pandas.core.dtypes.concat import union_categoricals
 from pandas import Index, to_numeric, to_datetime, to_timedelta
@@ -1292,8 +1292,7 @@ cdef class TextReader:
                     dtype.categories is not None):
                 # recode for dtype.categories
                 categories = dtype.categories
-                indexer = categories.get_indexer(cats)
-                codes = take_1d(indexer, codes, fill_value=-1)
+                codes = _recode_for_categories(codes, cats, categories)
                 ordered = dtype.ordered
             elif not cats.is_monotonic_increasing:
                 # sort categories and recode if necessary
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
index 096fcf9a52718..7d3df6201a390 100644
--- a/pandas/tests/io/parser/dtypes.py
+++ b/pandas/tests/io/parser/dtypes.py
@@ -203,6 +203,14 @@ def test_categoricaldtype_coerces_datetime(self):
         result = self.read_csv(StringIO(data), dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
+        dtype = {
+            'b': CategoricalDtype([pd.Timestamp("2014")])
+        }
+        data = "b\n2014-01-01\n2014-01-01T00:00:00"
+        expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
     def test_categoricaldtype_coerces_timedelta(self):
         dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
         data = "b\n1H\n2H\n3H"

From b02882714601004fe8a9480d04461574d550dde5 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 26 Sep 2017 15:50:31 -0500
Subject: [PATCH 10/17] Refactor with maybe_convert_for_categorical

---
 doc/source/io.rst                |  2 ++
 pandas/_libs/parsers.pyx         | 20 +++------------
 pandas/core/dtypes/cast.py       | 38 ++++++++++++++++++++++++++++-
 pandas/io/parsers.py             | 29 +++++++++-------------
 pandas/tests/dtypes/test_cast.py | 42 +++++++++++++++++++++++++++++++-
 5 files changed, 96 insertions(+), 35 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 58e854d16afa2..4d47d8b77aebf 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -469,6 +469,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
+.. versionadded:: 0.21.0
+
 Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical``
 whose ``categories`` are the unique values observed in the data. For more
 control on the categories and order, create a
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 9db225a6287db..9587baa3f10f7 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -48,7 +48,8 @@ from pandas.core.dtypes.common import (
 from pandas.core.categorical import Categorical, _recode_for_categories
 from pandas.core.algorithms import take_1d
 from pandas.core.dtypes.concat import union_categoricals
-from pandas import Index, to_numeric, to_datetime, to_timedelta
+from pandas.core.dtypes.cast import maybe_convert_for_categorical
+from pandas import Index
 
 import pandas.io.common as com
 
@@ -1274,19 +1275,7 @@ cdef class TextReader:
                 na_hashset, self.c_encoding)
             cats = Index(cats)
 
-            # Determine if we should convert inferred string
-            # categories to a specialized type
-            if (isinstance(dtype, CategoricalDtype) and
-                    dtype.categories is not None):
-                if dtype.categories.is_numeric():
-                    # is ignore correct?
-                    cats = to_numeric(cats, errors='ignore')
-                elif dtype.categories.is_all_dates:
-                    # is ignore correct?
-                    if is_datetime64_dtype(dtype.categories):
-                        cats = to_datetime(cats, errors='ignore')
-                    else:
-                        cats = to_timedelta(cats, errors='ignore')
+            cats = maybe_convert_for_categorical(cats, dtype)
 
             if (isinstance(dtype, CategoricalDtype) and
                     dtype.categories is not None):
@@ -1298,8 +1287,7 @@ cdef class TextReader:
                 # sort categories and recode if necessary
                 unsorted = cats.copy()
                 categories = cats.sort_values()
-                indexer = categories.get_indexer(unsorted)
-                codes = take_1d(indexer, codes, fill_value=-1)
+                codes = _recode_for_categories(codes, unsorted, categories)
                 ordered = False
             else:
                 categories = cats
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index c2cf6afc1a7b5..acc389bcdaa11 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -24,7 +24,8 @@
                      _ensure_int32, _ensure_int64,
                      _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
                      _POSSIBLY_CAST_DTYPES)
-from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype
+from .dtypes import (ExtensionDtype, DatetimeTZDtype, PeriodDtype,
+                     CategoricalDtype)
 from .generic import (ABCDatetimeIndex, ABCPeriodIndex,
                       ABCSeries)
 from .missing import isna, notna
@@ -604,6 +605,41 @@ def conv(r, dtype):
     return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
 
 
+def maybe_convert_for_categorical(categories, dtype):
+    """Convert ``categories`` depending on ``dtype``.
+
+    Converts to numeric, datetime, or timedelta types, when ``dtype`` is
+    a CategoricalDtype with known, non-object categories.
+
+    Parameters
+    ----------
+    categories : array-like
+    type : CategoricalDtype
+
+    Returns
+    -------
+    new_categories : array or Index
+
+    Examples
+    --------
+    >>> maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2]))
+    array([  1,  2])
+    >>> maybe_convert_for_categorical([1, 'a'], CategoricalDtype([1, 2]))
+    array([  1.,  nan])
+    """
+    if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
+        from pandas import to_numeric, to_datetime, to_timedelta
+
+        if dtype.categories.is_numeric():
+            categories = to_numeric(categories, errors='coerce')
+        elif is_datetime64_dtype(dtype.categories):
+            categories = to_datetime(categories, errors='coerce')
+        elif is_timedelta64_dtype(dtype.categories):
+            categories = to_timedelta(categories, errors='coerce')
+
+    return categories
+
+
 def astype_nansafe(arr, dtype, copy=True):
     """ return a view if copy is False, but
         need to be very careful as the result shape could change! """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 286e3e2e72d94..1ac82d1e83f7c 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -12,7 +12,7 @@
 
 import numpy as np
 
-from pandas import compat, to_numeric, to_timedelta
+from pandas import compat
 from pandas.compat import (range, lrange, PY3, StringIO, lzip,
                            zip, string_types, map, u)
 from pandas.core.dtypes.common import (
@@ -20,11 +20,11 @@
     is_list_like, is_integer_dtype,
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
-    is_scalar, is_categorical_dtype,
-    is_datetime64_dtype, is_timedelta64_dtype)
+    is_scalar, is_categorical_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
-from pandas.core.dtypes.cast import astype_nansafe
+from pandas.core.dtypes.cast import (astype_nansafe,
+                                     maybe_convert_for_categorical)
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
                                _ensure_index_from_sequences)
 from pandas.core.series import Series
@@ -1609,21 +1609,16 @@ def _cast_types(self, values, cast_type, column):
             # as strings
             known_cats = (isinstance(cast_type, CategoricalDtype) and
                           cast_type.categories is not None)
-            str_values = is_object_dtype(values)
-
-            if known_cats and str_values:
-                if cast_type.categories.is_numeric():
-                    values = to_numeric(values, errors='ignore')
-                elif is_datetime64_dtype(cast_type.categories):
-                    values = tools.to_datetime(values, errors='ignore')
-                elif is_timedelta64_dtype(cast_type.categories):
-                    values = to_timedelta(values, errors='ignore')
-                values = Categorical(values, categories=cast_type.categories,
-                                     ordered=cast_type.ordered)
+
+            categories = ordered = None
+            if known_cats:
+                values = maybe_convert_for_categorical(values, cast_type)
+                categories = cast_type.categories
+                ordered = cast_type.ordered
             elif not is_object_dtype(values):
                 values = astype_nansafe(values, str)
-            else:
-                values = Categorical(values)
+            values = Categorical(values, categories=categories,
+                                 ordered=ordered)
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index d9fb458c83529..f1fe7d720af3b 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -16,6 +16,7 @@
 from pandas.core.dtypes.cast import (
     maybe_downcast_to_dtype,
     maybe_convert_objects,
+    maybe_convert_for_categorical,
     cast_scalar_to_array,
     infer_dtype_from_scalar,
     infer_dtype_from_array,
@@ -25,7 +26,8 @@
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
     DatetimeTZDtype,
-    PeriodDtype)
+    PeriodDtype,
+    CategoricalDtype)
 from pandas.core.dtypes.common import (
     is_dtype_equal)
 from pandas.util import testing as tm
@@ -299,6 +301,44 @@ def test_maybe_infer_to_datetimelike(self):
                                      [NaT, 'b', 1]]))
         assert result.size == 6
 
+    def test_maybe_convert_for_categorical_noop(self):
+        expected = ['1', '2']
+        result = maybe_convert_for_categorical(expected, None)
+        assert result == expected
+
+        result = maybe_convert_for_categorical(expected, CategoricalDtype())
+        assert result == expected
+
+        result = maybe_convert_for_categorical(expected, 'category')
+        assert result == expected
+
+    @pytest.mark.parametrize('categories, dtype, expected', [
+        (['1', '2'], [1, 2, 3], np.array([1, 2])),
+        (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan])),
+    ])
+    def test_maybe_convert_for_categorical(self, categories, dtype, expected):
+        dtype = CategoricalDtype(dtype)
+        result = maybe_convert_for_categorical(categories, dtype)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize('categories, dtype, expected', [
+        (['2016', '2017'], pd.to_datetime(['2016', '2017']),
+         pd.to_datetime(['2016', '2017'])),
+        (['2016', '2017', 'bad'], pd.to_datetime(['2016', '2017']),
+         pd.to_datetime(['2016', '2017', 'NaT'])),
+
+        (['1H', '2H'], pd.to_timedelta(['1H', '2H']),
+         pd.to_timedelta(['1H', '2H'])),
+        (['1H', '2H', 'bad'], pd.to_timedelta(['1H', '2H']),
+         pd.to_timedelta(['1H', '2H', 'NaT'])),
+
+    ])
+    def test_maybe_convert_for_categorical_dates(self, categories, dtype,
+                                                 expected):
+        dtype = CategoricalDtype(dtype)
+        result = maybe_convert_for_categorical(categories, dtype)
+        tm.assert_index_equal(result, expected)
+
 
 class TestConvert(object):
 

From fc34080d91160f1fc9de835cec97b858da467002 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 27 Sep 2017 09:56:56 -0500
Subject: [PATCH 11/17] PEP8

---
 pandas/tests/dtypes/test_cast.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index f1fe7d720af3b..b8cab280f3aea 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -26,8 +26,7 @@
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
     DatetimeTZDtype,
-    PeriodDtype,
-    CategoricalDtype)
+    PeriodDtype)
 from pandas.core.dtypes.common import (
     is_dtype_equal)
 from pandas.util import testing as tm

From d100f0c3bb24b94525466c59b408a2ee94cf898b Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 27 Sep 2017 09:58:35 -0500
Subject: [PATCH 12/17] Type for 32bit

---
 pandas/tests/dtypes/test_cast.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index b8cab280f3aea..96cb6dd8550c5 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -312,8 +312,8 @@ def test_maybe_convert_for_categorical_noop(self):
         assert result == expected
 
     @pytest.mark.parametrize('categories, dtype, expected', [
-        (['1', '2'], [1, 2, 3], np.array([1, 2])),
-        (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan])),
+        (['1', '2'], [1, 2, 3], np.array([1, 2], dtype='i8')),
+        (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan], dtype='f8')),
     ])
     def test_maybe_convert_for_categorical(self, categories, dtype, expected):
         dtype = CategoricalDtype(dtype)

From 8600c505b2ec922b5fae6c776d6cebe8b40f0419 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 28 Sep 2017 07:37:43 -0500
Subject: [PATCH 13/17] REF: refactor to new method

---
 pandas/_libs/parsers.pyx         | 25 ++-----------------
 pandas/core/categorical.py       | 43 ++++++++++++++++++++++++++++++++
 pandas/tests/test_categorical.py | 34 +++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 9587baa3f10f7..04f3419f87b85 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1273,30 +1273,9 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
-            cats = Index(cats)
-
-            cats = maybe_convert_for_categorical(cats, dtype)
-
-            if (isinstance(dtype, CategoricalDtype) and
-                    dtype.categories is not None):
-                # recode for dtype.categories
-                categories = dtype.categories
-                codes = _recode_for_categories(codes, cats, categories)
-                ordered = dtype.ordered
-            elif not cats.is_monotonic_increasing:
-                # sort categories and recode if necessary
-                unsorted = cats.copy()
-                categories = cats.sort_values()
-                codes = _recode_for_categories(codes, unsorted, categories)
-                ordered = False
-            else:
-                categories = cats
-                ordered = False
-
-            cat = Categorical(codes, categories=categories, ordered=ordered,
-                              fastpath=True)
-
+            cat = Categorical._from_inferred_categories(cats, codes, dtype)
             return cat, na_count
+
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 8b055e9ae59c3..5b9514a923a06 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -509,6 +509,49 @@ def base(self):
         """ compat, we are always our own object """
         return None
 
+    @classmethod
+    def _from_inferred_categories(cls, inferred_categories, inferred_codes,
+                                  dtype):
+        """Construct a Categorical from inferred values
+
+        For inferred categories (`dtype` is None) the categories are sorted.
+        For explicit `dtype`, the `inferred_categories` are cast to the
+        appropriate type.
+
+        Parameters
+        ----------
+
+        inferred_categories, inferred_codes : Index
+        dtype : CategoricalDtype
+
+        Returns
+        -------
+        Categorical
+        """
+        from pandas.core.dtypes.cast import maybe_convert_for_categorical
+        from pandas import Index
+
+        cats = Index(inferred_categories)
+        cats = maybe_convert_for_categorical(cats, dtype)
+
+        if (isinstance(dtype, CategoricalDtype) and
+                dtype.categories is not None):
+            # recode for dtype.categories
+            categories = dtype.categories
+            codes = _recode_for_categories(inferred_codes, cats, categories)
+        elif not cats.is_monotonic_increasing:
+            # sort categories and recode if necessary
+            unsorted = cats.copy()
+            categories = cats.sort_values()
+            codes = _recode_for_categories(inferred_codes, unsorted,
+                                           categories)
+            dtype = CategoricalDtype(categories, ordered=False)
+        else:
+            dtype = CategoricalDtype(cats, ordered=False)
+            codes = inferred_codes
+
+        return cls(codes, dtype=dtype, fastpath=True)
+
     @classmethod
     def from_array(cls, data, **kwargs):
         """
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index d43901ea091b7..6130ca44ad0cd 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -560,6 +560,40 @@ def f():
             codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
             pd.Categorical.from_codes(codes, categories=["train", "test"])
 
+    @pytest.mark.parametrize('dtype', [None, 'category'])
+    def test_from_inferred_categories(self, dtype):
+        cats = ['a', 'b']
+        codes = [0, 0, 1, 1]
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical.from_codes(codes, cats)
+        tm.assert_categorical_equal(result, expected)
+
+    @pytest.mark.parametrize('dtype', [None, 'category'])
+    def test_from_inferred_categories_sorts(self, dtype):
+        cats = ['b', 'a']
+        codes = [0, 1, 1, 1]
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_from_inferred_categories_dtype(self):
+        cats = ['a', 'b', 'd']
+        codes = [0, 1, 0, 2]
+        dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical(['a', 'b', 'a', 'd'],
+                               categories=['c', 'b', 'a'],
+                               ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_from_inferred_categories_coerces(self):
+        cats = ['1', '2', 'bad']
+        codes = [0, 0, 1, 2]
+        dtype = CategoricalDtype([1, 2])
+        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        expected = Categorical([1, 1, 2, np.nan])
+        tm.assert_categorical_equal(result, expected)
+
     def test_validate_ordered(self):
         # see gh-14058
         exp_msg = "'ordered' must either be 'True' or 'False'"

From 96d51441c8fe587498d7ec087f49dc389114facb Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 28 Sep 2017 10:33:57 -0500
Subject: [PATCH 14/17] py2 compat

---
 pandas/tests/test_categorical.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 6130ca44ad0cd..9e3bd40dc275a 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -563,7 +563,7 @@ def f():
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories(self, dtype):
         cats = ['a', 'b']
-        codes = [0, 0, 1, 1]
+        codes = np.array([0, 0, 1, 1], dtype='i8')
         result = Categorical._from_inferred_categories(cats, codes, dtype)
         expected = Categorical.from_codes(codes, cats)
         tm.assert_categorical_equal(result, expected)
@@ -571,14 +571,14 @@ def test_from_inferred_categories(self, dtype):
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories_sorts(self, dtype):
         cats = ['b', 'a']
-        codes = [0, 1, 1, 1]
+        codes = np.array([0, 1, 1, 1], dtype='i8')
         result = Categorical._from_inferred_categories(cats, codes, dtype)
         expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
         tm.assert_categorical_equal(result, expected)
 
     def test_from_inferred_categories_dtype(self):
         cats = ['a', 'b', 'd']
-        codes = [0, 1, 0, 2]
+        codes = np.array([0, 1, 0, 2], dtype='i8')
         dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
         result = Categorical._from_inferred_categories(cats, codes, dtype)
         expected = Categorical(['a', 'b', 'a', 'd'],
@@ -588,7 +588,7 @@ def test_from_inferred_categories_dtype(self):
 
     def test_from_inferred_categories_coerces(self):
         cats = ['1', '2', 'bad']
-        codes = [0, 0, 1, 2]
+        codes = np.array([0, 0, 1, 2], dtype='i8')
         dtype = CategoricalDtype([1, 2])
         result = Categorical._from_inferred_categories(cats, codes, dtype)
         expected = Categorical([1, 1, 2, np.nan])

From 3de75cdb60d3cd05a878450a92471692b7b0e0b6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 29 Sep 2017 06:24:43 -0500
Subject: [PATCH 15/17] Refactored

---
 pandas/_libs/parsers.pyx         |  1 -
 pandas/core/categorical.py       | 17 +++++++++++---
 pandas/core/dtypes/cast.py       | 38 +------------------------------
 pandas/io/parsers.py             | 19 ++++++++--------
 pandas/tests/dtypes/test_cast.py | 39 --------------------------------
 5 files changed, 24 insertions(+), 90 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 04f3419f87b85..60a646769dd1a 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -48,7 +48,6 @@ from pandas.core.dtypes.common import (
 from pandas.core.categorical import Categorical, _recode_for_categories
 from pandas.core.algorithms import take_1d
 from pandas.core.dtypes.concat import union_categoricals
-from pandas.core.dtypes.cast import maybe_convert_for_categorical
 from pandas import Index
 
 import pandas.io.common as com
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index d173298656e3f..d5a161874891f 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -21,6 +21,8 @@
     _ensure_platform_int,
     is_dtype_equal,
     is_datetimelike,
+    is_datetime64_dtype,
+    is_timedelta64_dtype,
     is_categorical,
     is_categorical_dtype,
     is_integer_dtype,
@@ -528,11 +530,20 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
         -------
         Categorical
         """
-        from pandas.core.dtypes.cast import maybe_convert_for_categorical
-        from pandas import Index
+        from pandas import Index, to_numeric, to_datetime, to_timedelta
 
         cats = Index(inferred_categories)
-        cats = maybe_convert_for_categorical(cats, dtype)
+
+        # Convert to a specialized type when `dtype` is specified
+        if (isinstance(dtype, CategoricalDtype) and
+                dtype.categories is not None):
+
+            if dtype.categories.is_numeric():
+                cats = to_numeric(inferred_categories, errors='coerce')
+            elif is_datetime64_dtype(dtype.categories):
+                cats = to_datetime(inferred_categories, errors='coerce')
+            elif is_timedelta64_dtype(dtype.categories):
+                cats = to_timedelta(inferred_categories, errors='coerce')
 
         if (isinstance(dtype, CategoricalDtype) and
                 dtype.categories is not None):
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index acc389bcdaa11..c2cf6afc1a7b5 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -24,8 +24,7 @@
                      _ensure_int32, _ensure_int64,
                      _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
                      _POSSIBLY_CAST_DTYPES)
-from .dtypes import (ExtensionDtype, DatetimeTZDtype, PeriodDtype,
-                     CategoricalDtype)
+from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype
 from .generic import (ABCDatetimeIndex, ABCPeriodIndex,
                       ABCSeries)
 from .missing import isna, notna
@@ -605,41 +604,6 @@ def conv(r, dtype):
     return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
 
 
-def maybe_convert_for_categorical(categories, dtype):
-    """Convert ``categories`` depending on ``dtype``.
-
-    Converts to numeric, datetime, or timedelta types, when ``dtype`` is
-    a CategoricalDtype with known, non-object categories.
-
-    Parameters
-    ----------
-    categories : array-like
-    type : CategoricalDtype
-
-    Returns
-    -------
-    new_categories : array or Index
-
-    Examples
-    --------
-    >>> maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2]))
-    array([  1,  2])
-    >>> maybe_convert_for_categorical([1, 'a'], CategoricalDtype([1, 2]))
-    array([  1.,  nan])
-    """
-    if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
-        from pandas import to_numeric, to_datetime, to_timedelta
-
-        if dtype.categories.is_numeric():
-            categories = to_numeric(categories, errors='coerce')
-        elif is_datetime64_dtype(dtype.categories):
-            categories = to_datetime(categories, errors='coerce')
-        elif is_timedelta64_dtype(dtype.categories):
-            categories = to_timedelta(categories, errors='coerce')
-
-    return categories
-
-
 def astype_nansafe(arr, dtype, copy=True):
     """ return a view if copy is False, but
         need to be very careful as the result shape could change! """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index f12d8f7c81ab5..42f51407f4e34 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -23,8 +23,7 @@
     is_scalar, is_categorical_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
-from pandas.core.dtypes.cast import (astype_nansafe,
-                                     maybe_convert_for_categorical)
+from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
                                _ensure_index_from_sequences)
 from pandas.core.series import Series
@@ -1610,15 +1609,15 @@ def _cast_types(self, values, cast_type, column):
             known_cats = (isinstance(cast_type, CategoricalDtype) and
                           cast_type.categories is not None)
 
-            categories = ordered = None
             if known_cats:
-                values = maybe_convert_for_categorical(values, cast_type)
-                categories = cast_type.categories
-                ordered = cast_type.ordered
-            elif not is_object_dtype(values):
-                values = astype_nansafe(values, str)
-            values = Categorical(values, categories=categories,
-                                 ordered=ordered)
+                cats = Index(values).unique()
+                values = Categorical._from_inferred_categories(
+                    cats, cats.get_indexer(values), cast_type
+                )
+            else:
+                if not is_object_dtype(values):
+                    values = astype_nansafe(values, str)
+                values = Categorical(values, categories=None, ordered=False)
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index 96cb6dd8550c5..d9fb458c83529 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -16,7 +16,6 @@
 from pandas.core.dtypes.cast import (
     maybe_downcast_to_dtype,
     maybe_convert_objects,
-    maybe_convert_for_categorical,
     cast_scalar_to_array,
     infer_dtype_from_scalar,
     infer_dtype_from_array,
@@ -300,44 +299,6 @@ def test_maybe_infer_to_datetimelike(self):
                                      [NaT, 'b', 1]]))
         assert result.size == 6
 
-    def test_maybe_convert_for_categorical_noop(self):
-        expected = ['1', '2']
-        result = maybe_convert_for_categorical(expected, None)
-        assert result == expected
-
-        result = maybe_convert_for_categorical(expected, CategoricalDtype())
-        assert result == expected
-
-        result = maybe_convert_for_categorical(expected, 'category')
-        assert result == expected
-
-    @pytest.mark.parametrize('categories, dtype, expected', [
-        (['1', '2'], [1, 2, 3], np.array([1, 2], dtype='i8')),
-        (['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan], dtype='f8')),
-    ])
-    def test_maybe_convert_for_categorical(self, categories, dtype, expected):
-        dtype = CategoricalDtype(dtype)
-        result = maybe_convert_for_categorical(categories, dtype)
-        tm.assert_numpy_array_equal(result, expected)
-
-    @pytest.mark.parametrize('categories, dtype, expected', [
-        (['2016', '2017'], pd.to_datetime(['2016', '2017']),
-         pd.to_datetime(['2016', '2017'])),
-        (['2016', '2017', 'bad'], pd.to_datetime(['2016', '2017']),
-         pd.to_datetime(['2016', '2017', 'NaT'])),
-
-        (['1H', '2H'], pd.to_timedelta(['1H', '2H']),
-         pd.to_timedelta(['1H', '2H'])),
-        (['1H', '2H', 'bad'], pd.to_timedelta(['1H', '2H']),
-         pd.to_timedelta(['1H', '2H', 'NaT'])),
-
-    ])
-    def test_maybe_convert_for_categorical_dates(self, categories, dtype,
-                                                 expected):
-        dtype = CategoricalDtype(dtype)
-        result = maybe_convert_for_categorical(categories, dtype)
-        tm.assert_index_equal(result, expected)
-
 
 class TestConvert(object):
 

From f03798dd242852f67c50d61c7a90567be0159ae7 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 29 Sep 2017 10:18:12 -0500
Subject: [PATCH 16/17] More in Categorical

---
 pandas/io/parsers.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 42f51407f4e34..c8b2987d591ef 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1603,21 +1603,20 @@ def _cast_types(self, values, cast_type, column):
         """
 
         if is_categorical_dtype(cast_type):
-            # XXX this is for consistency with
-            # c-parser which parses all categories
-            # as strings
             known_cats = (isinstance(cast_type, CategoricalDtype) and
                           cast_type.categories is not None)
 
-            if known_cats:
-                cats = Index(values).unique()
-                values = Categorical._from_inferred_categories(
-                    cats, cats.get_indexer(values), cast_type
-                )
-            else:
-                if not is_object_dtype(values):
-                    values = astype_nansafe(values, str)
-                values = Categorical(values, categories=None, ordered=False)
+            if not is_object_dtype(values) and not known_cats:
+                # XXX this is for consistency with
+                # c-parser which parses all categories
+                # as strings
+                values = astype_nansafe(values, str)
+
+            cats = Index(values).unique().dropna()
+            values = Categorical._from_inferred_categories(
+                cats, cats.get_indexer(values), cast_type
+            )
+
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)

From 9325a9345f3a4495ff81eb692121c194d8d7c040 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 2 Oct 2017 06:44:59 -0500
Subject: [PATCH 17/17] fixup! More in Categorical

---
 pandas/core/categorical.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index d5a161874891f..ce71e6fd74326 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -523,8 +523,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
         Parameters
         ----------
 
-        inferred_categories, inferred_codes : Index
-        dtype : CategoricalDtype
+        inferred_categories : Index
+        inferred_codes : Index
+        dtype : CategoricalDtype or 'category'
 
         Returns
         -------
@@ -534,10 +535,11 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
 
         cats = Index(inferred_categories)
 
-        # Convert to a specialzed type with `dtype` is specified
-        if (isinstance(dtype, CategoricalDtype) and
-                dtype.categories is not None):
+        known_categories = (isinstance(dtype, CategoricalDtype) and
+                            dtype.categories is not None)
 
+        if known_categories:
+            # Convert to a specialized type with `dtype` if specified
             if dtype.categories.is_numeric():
                 cats = to_numeric(inferred_categories, errors='coerce')
             elif is_datetime64_dtype(dtype.categories):
@@ -545,13 +547,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
             elif is_timedelta64_dtype(dtype.categories):
                 cats = to_timedelta(inferred_categories, errors='coerce')
 
-        if (isinstance(dtype, CategoricalDtype) and
-                dtype.categories is not None):
-            # recode for dtype.categories
+        if known_categories:
+            # recode from observation order to dtype.categories order
             categories = dtype.categories
             codes = _recode_for_categories(inferred_codes, cats, categories)
         elif not cats.is_monotonic_increasing:
-            # sort categories and recode if necessary
+            # sort categories and recode for unknown categories
             unsorted = cats.copy()
             categories = cats.sort_values()
             codes = _recode_for_categories(inferred_codes, unsorted,