
Accept CategoricalDtype in read_csv #17643


Merged: 19 commits, Oct 2, 2017
Changes from 6 commits
27 changes: 22 additions & 5 deletions doc/source/io.rst
@@ -452,7 +452,8 @@ Specifying Categorical dtype

.. versionadded:: 0.19.0

``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
``dtype=CategoricalDtype(categories, ordered)``.

.. ipython:: python

@@ -468,12 +469,28 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes

Specifying ``dtype='category'`` will result in an unordered ``Categorical``

Contributor: versionadded here

Contributor: maybe a sub-section for this?

whose ``categories`` are the unique values observed in the data. For more
control on the categories and order, create a
:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for
that column's ``dtype``.

.. ipython:: python

from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes

.. note::

The resulting categories will always be parsed as strings (object dtype).
If the categories are numeric they can be converted using the
:func:`to_numeric` function, or as appropriate, another converter
such as :func:`to_datetime`.
With ``dtype='category'``, the resulting categories will always be parsed
as strings (object dtype). If the categories are numeric they can be
converted using the :func:`to_numeric` function, or as appropriate, another
converter such as :func:`to_datetime`.
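
For illustration, a minimal standalone sketch of the conversion this note recommends (not part of the diff; the column name and data are made up):

    from io import StringIO
    import pandas as pd

    data = 'col1\n1\n2\n3'
    df = pd.read_csv(StringIO(data), dtype={'col1': 'category'})
    # with dtype='category' the categories come back as strings
    df['col1'].cat.categories          # Index(['1', '2', '3'], dtype='object')
    # convert them to numeric after parsing
    df['col1'] = df['col1'].cat.rename_categories(
        pd.to_numeric(df['col1'].cat.categories))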

When ``dtype`` is a ``CategoricalDtype`` with homogenous ``categoriess`` (

Member: categoriess -> categories

all numeric, all datetimes, etc.), the conversion is done automatically.

.. ipython:: python

Expand Down
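
The ipython block above is collapsed in this view; roughly, the automatic conversion it documents behaves like the following sketch (made-up data, assuming all-numeric categories):

    from io import StringIO
    import pandas as pd
    from pandas.api.types import CategoricalDtype

    data = 'col1\n1\n2\n3'
    dtype = CategoricalDtype([1, 2, 3])
    df = pd.read_csv(StringIO(data), dtype={'col1': dtype})
    # the categories are numeric rather than strings, because the supplied
    # CategoricalDtype had homogeneous (all-numeric) categories
    df['col1'].cat.categories
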
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
@@ -163,6 +163,8 @@ Other Enhancements
- :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`)
- :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`)
- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
- Pass a :class:`~pandas.api.types.CategoricalDtype` to :meth:`read_csv` to parse categorical

Member: I would clarify this should be passed to the dtype keyword?

Also, apart from the fact you can also have non-string categories, are there not more benefits (like being able to specify the categories yourself, specific order, ... performance?)?

Contributor Author: Perhaps I'll merge this with the main section for CategoricalDtype. (no extra performance yet though)

data as numeric, datetimes, or timedeltas, instead of strings. See :ref:`here <io.categorical>`. (:issue:`17643`)


.. _whatsnew_0210.api_breaking:
Expand Down
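
On the reviewer's question about other benefits: supplying the categories yourself also controls which values are treated as valid; anything not listed is parsed as missing, and the ordering you specify is kept. A small illustrative sketch (made-up data, not part of the PR):

    from io import StringIO
    import pandas as pd
    from pandas.api.types import CategoricalDtype

    data = 'col1\na\nb\nd'
    dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True)
    df = pd.read_csv(StringIO(data), dtype={'col1': dtype})
    # 'd' is not among the specified categories, so it is parsed as NaN,
    # and the requested ordering a < b < c is preserved
    df['col1'].isna().tolist()         # [False, False, True]
    df['col1'].cat.ordered             # True
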
51 changes: 42 additions & 9 deletions pandas/_libs/parsers.pyx
@@ -48,7 +48,7 @@ from pandas.core.dtypes.common import (
from pandas.core.categorical import Categorical
from pandas.core.algorithms import take_1d
from pandas.core.dtypes.concat import union_categoricals
from pandas import Index
from pandas import Index, to_numeric, to_datetime, to_timedelta

import pandas.io.common as com

@@ -1267,19 +1267,49 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter,
na_hashset)
elif is_categorical_dtype(dtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
codes, cats, na_count = _categorical_convert(
self.parser, i, start, end, na_filter,
na_hashset, self.c_encoding)
# sort categories and recode if necessary
cats = Index(cats)
if not cats.is_monotonic_increasing:

# Determine if we should convert inferred string
# categories to a specialized type
if (isinstance(dtype, CategoricalDtype) and

Contributor: I would rather move this entire section to a free function (except for the actual constructor), maybe:

    cats, dtype = infer_categorical_dtype(cats)  # put in pandas.core.dtypes.cast.py
    cats = Categorical(cats, codes, dtype=dtype)

NONE of this logic should be here.

dtype.categories is not None):
if dtype.categories.is_numeric():
# is ignore correct?
cats = to_numeric(cats, errors='ignore')
elif dtype.categories.is_all_dates:

Contributor: I think this may leave open corner cases where strings don't map 1->1 with categories? For example:

    cats:
    # DatetimeIndex(['2014-01-01'], dtype='datetime64[ns]', freq=None)

    data:
    # ['2014-01-01', '2014-01-01T00:00:00', '2014-01-01']

Contributor Author: Sorry, I don't follow. This passes:

        dtype = {
            'b': CategoricalDtype([pd.Timestamp("2014")])
        }
        # Two representations of the same value
        data = "b\n2014-01-01\n2014-01-01T00:00:00"
        expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)})
        result = self.read_csv(StringIO(data), dtype=dtype)
        tm.assert_frame_equal(result, expected)

Contributor: Does result['b'] not have duplicated categories? Sorry, don't have it checked out locally, only guessing.

Contributor Author: No problem. It has multiple values, but the categories are unique.

In [10]: pd.read_csv(StringIO(data), dtype=dtype).b.dtype
Out[10]: CategoricalDtype(categories=['2014-01-01'], ordered=False)

The categories passed to the Categorical constructor later on comes directly from dtype.categories, which is unique. The coercion is done on the values so it's OK if different string forms are coerced to the same value.

# is ignore correct?
if is_datetime64_dtype(dtype.categories):
cats = to_datetime(cats, errors='ignore')
else:
cats = to_timedelta(cats, errors='ignore')

if (isinstance(dtype, CategoricalDtype) and
dtype.categories is not None):
# recode for dtype.categories
categories = dtype.categories

Contributor Author: Fixed (will wait to push until I hear back about #17643 (comment))

indexer = categories.get_indexer(cats)
codes = take_1d(indexer, codes, fill_value=-1)
ordered = dtype.ordered
elif not cats.is_monotonic_increasing:
# sort categories and recode if necessary
unsorted = cats.copy()
cats = cats.sort_values()
indexer = cats.get_indexer(unsorted)
categories = cats.sort_values()

Contributor: I would move ALL of this logic and simply create a new factory for Categorical.infer_from_categories(cats, codes, dtype=dtype) (and even fold in the maybe_convert_for_categorical). This just makes parsing code longer and longer; we want to push down logic to the dtypes.

indexer = categories.get_indexer(unsorted)
codes = take_1d(indexer, codes, fill_value=-1)
ordered = False
else:
categories = cats
ordered = False

cat = Categorical(codes, categories=categories, ordered=ordered,
fastpath=True)

return Categorical(codes, categories=cats, ordered=False,
fastpath=True), na_count
return cat, na_count
elif is_object_dtype(dtype):
return self._string_convert(i, start, end, na_filter,
na_hashset)
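
The recoding performed above (mapping the codes returned by _categorical_convert onto the user-supplied categories) can be sketched in plain Python roughly as follows. The free-function form and its name are illustrative; in the PR the logic is inline in the Cython reader and relies on PR-era internals (take_1d, the fastpath Categorical constructor):

    from pandas import Categorical, Index
    from pandas.core.algorithms import take_1d

    def recode_for_dtype(cats, codes, dtype):
        # `cats` holds the unique values seen by the parser and `codes`
        # (an integer ndarray) indexes into them; remap both onto the
        # categories requested by the user's CategoricalDtype
        categories = dtype.categories
        indexer = categories.get_indexer(Index(cats))
        # parsed values whose category is missing from dtype.categories
        # get code -1, i.e. NaN in the result
        new_codes = take_1d(indexer, codes, fill_value=-1)
        return Categorical(new_codes, categories=categories,
                           ordered=dtype.ordered, fastpath=True)

For example, with inferred cats ['a', 'b', 'c'], codes [0, 1, 1, 2] and dtype CategoricalDtype(['c', 'b', 'a']), the recoded codes are [2, 1, 1, 0].
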
@@ -2230,8 +2260,11 @@ def _concatenate_chunks(list chunks):
if common_type == np.object:
warning_columns.append(str(name))

if is_categorical_dtype(dtypes.pop()):
result[name] = union_categoricals(arrs, sort_categories=True)
dtype = dtypes.pop()
if is_categorical_dtype(dtype):
sort_categories = isinstance(dtype, str)

Contributor: str -> string_types

result[name] = union_categoricals(arrs,
sort_categories=sort_categories)
else:
result[name] = np.concatenate(arrs)

Expand Down
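
The _concatenate_chunks change only sorts categories when the user asked for the plain string dtype 'category'; with an explicit CategoricalDtype the chunks already share the user's categories, and sorting would discard the requested order. A standalone illustration with made-up data, using the same union_categoricals helper the parser imports:

    import pandas as pd
    from pandas.core.dtypes.concat import union_categoricals

    # two chunks that share an explicit, unsorted category order
    chunk1 = pd.Categorical(['a', 'b'], categories=['c', 'b', 'a'])
    chunk2 = pd.Categorical(['b', 'c'], categories=['c', 'b', 'a'])

    # explicit CategoricalDtype: keep the user's order
    union_categoricals([chunk1, chunk2], sort_categories=False).categories
    # Index(['c', 'b', 'a'], dtype='object')

    # dtype='category' given as a string: sort, as before this PR
    union_categoricals([chunk1, chunk2], sort_categories=True).categories
    # Index(['a', 'b', 'c'], dtype='object')
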
24 changes: 20 additions & 4 deletions pandas/io/parsers.py
@@ -12,15 +12,17 @@

import numpy as np

from pandas import compat
from pandas import compat, to_numeric, to_timedelta
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
zip, string_types, map, u)
from pandas.core.dtypes.common import (
is_integer, _ensure_object,
is_list_like, is_integer_dtype,
is_float, is_dtype_equal,
is_object_dtype, is_string_dtype,
is_scalar, is_categorical_dtype)
is_scalar, is_categorical_dtype,
is_datetime64_dtype, is_timedelta64_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.index import (Index, MultiIndex, RangeIndex,
@@ -1605,9 +1607,23 @@ def _cast_types(self, values, cast_type, column):
# XXX this is for consistency with
# c-parser which parses all categories
# as strings
if not is_object_dtype(values):
known_cats = (isinstance(cast_type, CategoricalDtype) and
cast_type.categories is not None)

Contributor: none of this logic should live here either. Move to pandas.core.dtypes.cast.py (also ok with a new module pandas.core.dtypes.categorical.py if it's simpler).

Contributor Author: Refactored most of this to pandas.core.dtypes.cast

str_values = is_object_dtype(values)

if known_cats and str_values:
if cast_type.categories.is_numeric():
values = to_numeric(values, errors='ignore')
elif is_datetime64_dtype(cast_type.categories):
values = tools.to_datetime(values, errors='ignore')
elif is_timedelta64_dtype(cast_type.categories):
values = to_timedelta(values, errors='ignore')
values = Categorical(values, categories=cast_type.categories,
ordered=cast_type.ordered)
elif not is_object_dtype(values):
values = astype_nansafe(values, str)
values = Categorical(values)
else:

Contributor: any reason you are not handling this case as well? (I get that it conflates the purpose of _from_inferred_categories a bit), but in reality this is just like passing dtype=None.

I don't like to scatter casting/inference code around; very hard to figure out what's going on when it's not in one place.

values = Categorical(values)
else:
try:
values = astype_nansafe(values, cast_type, copy=True)
Expand Down
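
For the Python engine, the new branch in _cast_types boils down to the following simplified standalone sketch (the helper name is made up; per the review discussion this logic was later refactored into pandas.core.dtypes.cast):

    import pandas as pd
    from pandas.api.types import (CategoricalDtype, is_object_dtype,
                                  is_datetime64_dtype, is_timedelta64_dtype)

    def cast_to_categorical(values, cast_type):
        known_cats = (isinstance(cast_type, CategoricalDtype) and
                      cast_type.categories is not None)
        if known_cats and is_object_dtype(values):
            # coerce the parsed strings so they compare equal to the
            # user-supplied categories
            if cast_type.categories.is_numeric():
                values = pd.to_numeric(values, errors='ignore')
            elif is_datetime64_dtype(cast_type.categories):
                values = pd.to_datetime(values, errors='ignore')
            elif is_timedelta64_dtype(cast_type.categories):
                values = pd.to_timedelta(values, errors='ignore')
            return pd.Categorical(values, categories=cast_type.categories,
                                  ordered=cast_type.ordered)
        # no explicit categories: fall back to inferring them from the data
        return pd.Categorical(values)
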
83 changes: 83 additions & 0 deletions pandas/tests/io/parser/dtypes.py
@@ -149,6 +149,89 @@ def test_categorical_dtype_chunksize(self):
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)

@pytest.mark.parametrize('ordered', [False, True])
@pytest.mark.parametrize('categories', [
['a', 'b', 'c'],
['a', 'c', 'b'],
['a', 'b', 'c', 'd'],
['c', 'b', 'a'],
])
def test_categorical_categoricaldtype(self, categories, ordered):
data = """a,b
1,a
1,b
1,b
2,c"""
expected = pd.DataFrame({
"a": [1, 1, 1, 2],
"b": Categorical(['a', 'b', 'b', 'c'],
categories=categories,
ordered=ordered)
})
dtype = {"b": CategoricalDtype(categories=categories,
ordered=ordered)}
result = self.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_categorical_categoricaldtype_unsorted(self):
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(['c', 'b', 'a'])
expected = pd.DataFrame({
'a': [1, 1, 1, 2],
'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a'])
})
result = self.read_csv(StringIO(data), dtype={'b': dtype})
tm.assert_frame_equal(result, expected)

def test_categoricaldtype_coerces_numeric(self):
dtype = {'b': CategoricalDtype([1, 2, 3])}
data = "b\n1\n1\n2\n3"
expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
result = self.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_categoricaldtype_coerces_datetime(self):
dtype = {
'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
}
data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
result = self.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_categoricaldtype_coerces_timedelta(self):
dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
data = "b\n1H\n2H\n3H"
expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
result = self.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_categorical_categoricaldtype_chunksize(self):
# GH 10153
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ['a', 'b', 'c']
expecteds = [pd.DataFrame({'a': [1, 1],
'b': Categorical(['a', 'b'],
categories=cats)}),
pd.DataFrame({'a': [1, 2],
'b': Categorical(['b', 'c'],
categories=cats)},
index=[2, 3])]
dtype = CategoricalDtype(cats)
actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
chunksize=2)

for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)

def test_empty_pass_dtype(self):
data = 'one,two'
result = self.read_csv(StringIO(data), dtype={'one': 'u1'})
Expand Down