Accept CategoricalDtype in read_csv #17643

Merged: 19 commits, Oct 2, 2017
Changes from 13 commits
39 changes: 34 additions & 5 deletions doc/source/io.rst
@@ -452,7 +452,8 @@ Specifying Categorical dtype

.. versionadded:: 0.19.0

``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
``dtype=CategoricalDtype(categories, ordered)``.

.. ipython:: python

@@ -468,12 +469,40 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes

.. versionadded:: 0.21.0

Specifying ``dtype='category'`` will result in an unordered ``Categorical``
whose ``categories`` are the unique values observed in the data. For more
control on the categories and order, create a
:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for
that column's ``dtype``.

Review comment (Contributor): versionadded here

Review comment (Contributor): maybe a sub-section for this?

.. ipython:: python

from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes

When using ``dtype=CategoricalDtype``, "unexpected" values outside of
``dtype.categories`` are treated as missing values.

.. ipython:: python

dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c'
pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1

Review comment (Member): missing .. ipython:: python directive here

This matches the behavior of :meth:`Categorical.set_categories`.
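
For readers who have not used :meth:`Categorical.set_categories`, here is a minimal, standalone sketch of that behavior (not part of this diff); values outside the supplied categories become missing:

```python
import pandas as pd

cat = pd.Categorical(['a', 'b', 'c', 'a'])
# 'c' is not in the new categories, so it is set to NaN -- the same rule
# read_csv applies to "unexpected" values when a CategoricalDtype is given.
print(cat.set_categories(['a', 'b', 'd']))
# [a, b, NaN, a]
# Categories (3, object): [a, b, d]
```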

.. note::

The resulting categories will always be parsed as strings (object dtype).
If the categories are numeric they can be converted using the
:func:`to_numeric` function, or as appropriate, another converter
such as :func:`to_datetime`.
With ``dtype='category'``, the resulting categories will always be parsed
as strings (object dtype). If the categories are numeric they can be
converted using the :func:`to_numeric` function, or as appropriate, another
converter such as :func:`to_datetime`.

When ``dtype`` is a ``CategoricalDtype`` with homogeneous ``categories``
(all numeric, all datetimes, etc.), the conversion is done automatically.

.. ipython:: python

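A rough, standalone sketch of the automatic conversion described in the note above; the sample data here is illustrative rather than the CSV used in the docs:

```python
import pandas as pd
from pandas.compat import StringIO  # io.StringIO works as well
from pandas.api.types import CategoricalDtype

data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

# Plain 'category': category labels stay as strings (object dtype).
obj_cats = pd.read_csv(StringIO(data), dtype={'col3': 'category'}).col3.cat.categories

# CategoricalDtype with all-numeric categories: labels are converted for us.
num_dtype = CategoricalDtype([1, 2, 3])
num_cats = pd.read_csv(StringIO(data), dtype={'col3': num_dtype}).col3.cat.categories

print(obj_cats.dtype, num_cats.dtype)  # object int64
```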
33 changes: 31 additions & 2 deletions doc/source/whatsnew/v0.21.0.txt
@@ -119,7 +119,7 @@ expanded to include the ``categories`` and ``ordered`` attributes. A
``CategoricalDtype`` can be used to specify the set of categories and
orderedness of an array, independent of the data themselves. This can be useful,
e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
:issue:`15078`, :issue:`16015`):
:issue:`15078`, :issue:`16015`, :issue:`17643`):

.. ipython:: python

@@ -129,8 +129,37 @@ e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True)
s.astype(dtype)

One place that deserves special mention is in :meth:`read_csv`. Previously, with
``dtype={'col': 'category'}``, the returned values and categories would always
be strings.

Review comment (Contributor): maybe a separate sub-section for this

.. ipython:: python
:suppress:

from pandas.compat import StringIO
Review comment (Member): in general we put this in the hidden code block at the
top of the file, as people shouldn't use this from pandas, but just import it
themselves.

.. ipython:: python

data = 'A,B\na,1\nb,2\nc,3'
pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories

Notice the "object" dtype.

With a ``CategoricalDtype`` of all numerics, datetimes, or
timedeltas, we can automatically convert to the correct type.

.. ipython:: python

dtype = {'B': CategoricalDtype([1, 2, 3])}
pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories

The values have been correctly interpreted as integers.

The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
``Series`` with categorical type will now return an instance of
``CategoricalDtype``. For the most part, this is backwards compatible, though
the string repr has changed. If you were previously using ``str(s.dtype) ==
'category'`` to detect categorical data, switch to
:func:`pandas.api.types.is_categorical_dtype`, which is compatible with the old
and new ``CategoricalDtype``.

See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
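
A small, hedged illustration of the detection advice above (output comments assume pandas 0.21):

```python
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype, is_categorical_dtype

s = pd.Series(['a', 'b', 'a'], dtype='category')

# s.dtype is now a CategoricalDtype instance rather than a bare string.
print(type(s.dtype))

# Recommended check -- works with both the old and the new CategoricalDtype.
print(is_categorical_dtype(s.dtype))                       # True
print(is_categorical_dtype(CategoricalDtype(['a', 'b'])))  # True
print(is_categorical_dtype(np.dtype('int64')))             # False
```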

40 changes: 30 additions & 10 deletions pandas/_libs/parsers.pyx
@@ -45,9 +45,10 @@ from pandas.core.dtypes.common import (
is_bool_dtype, is_object_dtype,
is_string_dtype, is_datetime64_dtype,
pandas_dtype)
from pandas.core.categorical import Categorical
from pandas.core.categorical import Categorical, _recode_for_categories
from pandas.core.algorithms import take_1d
from pandas.core.dtypes.concat import union_categoricals
from pandas.core.dtypes.cast import maybe_convert_for_categorical
from pandas import Index

import pandas.io.common as com
@@ -1267,19 +1268,35 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter,
na_hashset)
elif is_categorical_dtype(dtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
codes, cats, na_count = _categorical_convert(
self.parser, i, start, end, na_filter,
na_hashset, self.c_encoding)
# sort categories and recode if necessary
cats = Index(cats)
if not cats.is_monotonic_increasing:

cats = maybe_convert_for_categorical(cats, dtype)

if (isinstance(dtype, CategoricalDtype) and
dtype.categories is not None):
# recode for dtype.categories
categories = dtype.categories
Review comment (Contributor Author): Fixed (will wait to push until I hear back
about #17643 (comment))

codes = _recode_for_categories(codes, cats, categories)
ordered = dtype.ordered
elif not cats.is_monotonic_increasing:
# sort categories and recode if necessary
unsorted = cats.copy()
cats = cats.sort_values()
indexer = cats.get_indexer(unsorted)
codes = take_1d(indexer, codes, fill_value=-1)
categories = cats.sort_values()
Review comment (Contributor): I would move ALL of this logic and simply create a
new factory for Categorical.infer_from_categories(cats, codes, dtype=dtype) (and
even fold in the maybe_convert_for_categorical). This just makes parsing code
longer and longer; we want to push down logic to the dtypes.

codes = _recode_for_categories(codes, unsorted, categories)
ordered = False
else:
categories = cats
ordered = False

cat = Categorical(codes, categories=categories, ordered=ordered,
fastpath=True)

return Categorical(codes, categories=cats, ordered=False,
fastpath=True), na_count
return cat, na_count
elif is_object_dtype(dtype):
return self._string_convert(i, start, end, na_filter,
na_hashset)
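
The ``_recode_for_categories`` helper used above is internal to pandas; as a rough pure-NumPy sketch (not the actual implementation) of what that recoding step achieves:

```python
import numpy as np

def recode_for_categories_sketch(codes, old_categories, new_categories):
    """Remap codes that index into old_categories so they index into
    new_categories; anything not in new_categories becomes -1 (missing)."""
    new = list(new_categories)
    mapping = np.array([new.index(c) if c in new else -1
                        for c in old_categories])
    codes = np.asarray(codes)
    return np.where(codes >= 0, mapping[codes], -1)

# The parser observed categories ['d', 'a']; the requested dtype lists
# ['a', 'b', 'd'], so the codes are remapped into that ordering.
print(recode_for_categories_sketch([0, 1, 1, -1], ['d', 'a'], ['a', 'b', 'd']))
# [ 2  0  0 -1]
```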
@@ -2230,8 +2247,11 @@ def _concatenate_chunks(list chunks):
if common_type == np.object:
warning_columns.append(str(name))

if is_categorical_dtype(dtypes.pop()):
result[name] = union_categoricals(arrs, sort_categories=True)
dtype = dtypes.pop()
if is_categorical_dtype(dtype):
sort_categories = isinstance(dtype, str)
Review comment (Contributor): str -> string_types

result[name] = union_categoricals(arrs,
sort_categories=sort_categories)
else:
result[name] = np.concatenate(arrs)

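For context on the ``sort_categories`` change above, a hedged sketch of how ``union_categoricals`` behaves when chunks see different category sets (the chunking itself is simulated here):

```python
import pandas as pd
from pandas.api.types import CategoricalDtype, union_categoricals

# dtype='category': each chunk infers its own categories, so the union is
# sorted to make the result independent of chunk boundaries.
chunk1 = pd.Categorical(['b', 'a'])
chunk2 = pd.Categorical(['c', 'a'])
print(union_categoricals([chunk1, chunk2], sort_categories=True).categories)
# Index(['a', 'b', 'c'], dtype='object')

# Explicit CategoricalDtype: every chunk is already coded against the
# user-supplied categories, so no re-sorting is needed or wanted.
dtype = CategoricalDtype(['c', 'b', 'a'])
chunk1 = pd.Categorical(['b', 'a'], dtype=dtype)
chunk2 = pd.Categorical(['c', 'a'], dtype=dtype)
print(union_categoricals([chunk1, chunk2]).categories)
# Index(['c', 'b', 'a'], dtype='object')
```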
38 changes: 37 additions & 1 deletion pandas/core/dtypes/cast.py
@@ -24,7 +24,8 @@
_ensure_int32, _ensure_int64,
_NS_DTYPE, _TD_DTYPE, _INT64_DTYPE,
_POSSIBLY_CAST_DTYPES)
from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype
from .dtypes import (ExtensionDtype, DatetimeTZDtype, PeriodDtype,
CategoricalDtype)
from .generic import (ABCDatetimeIndex, ABCPeriodIndex,
ABCSeries)
from .missing import isna, notna
@@ -604,6 +605,41 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


def maybe_convert_for_categorical(categories, dtype):
"""Convert ``categories`` depending on ``dtype``.

Converts to numeric, datetime, or timedelta types, when ``dtype`` is
a CategoricalDtype with known, non-object categories.

Parameters
----------
categories : array-like
dtype : CategoricalDtype

Returns
-------
new_categories : array or Index

Examples
--------
>>> maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2]))
array([ 1, 2])
>>> maybe_convert_for_categorical([1, 'a'], CategoricalDtype([1, 2]))
array([ 1., nan])
"""
if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
Review comment (Contributor): in reality this is just an Index routine maybe

    Index(categories).coerce_to_dtype(dtype.categories)

and the isinstance(dtype, ....) logic can be in Categorical._from_inferred....

Review comment (Contributor): see my comment below, you can simply fold this in
to Categorical._from_inferred_categories / not averse to making the inside of
this an Index routine though (as it's just coercing on the index type).

from pandas import to_numeric, to_datetime, to_timedelta

if dtype.categories.is_numeric():
categories = to_numeric(categories, errors='coerce')
elif is_datetime64_dtype(dtype.categories):
categories = to_datetime(categories, errors='coerce')
elif is_timedelta64_dtype(dtype.categories):
categories = to_timedelta(categories, errors='coerce')

return categories


def astype_nansafe(arr, dtype, copy=True):
""" return a view if copy is False, but
need to be very careful as the result shape could change! """
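A short usage sketch of the helper added above, based on its docstring (the function exists only on this PR's branch, so the imports assume that branch):

```python
import pandas as pd
from pandas.core.dtypes.cast import maybe_convert_for_categorical
from pandas.core.dtypes.dtypes import CategoricalDtype

# Numeric categories: the string labels parsed from a CSV are coerced.
print(maybe_convert_for_categorical(['1', '2'], CategoricalDtype([1, 2, 3])))
# [1 2]

# Datetime categories: coercion goes through to_datetime; bad labels -> NaT.
dt_dtype = CategoricalDtype(pd.to_datetime(['2016', '2017']))
print(maybe_convert_for_categorical(['2016', 'oops'], dt_dtype))

# No known categories (plain 'category' or bare CategoricalDtype): no-op.
print(maybe_convert_for_categorical(['1', '2'], 'category'))
# ['1', '2']
```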
17 changes: 14 additions & 3 deletions pandas/io/parsers.py
@@ -21,8 +21,10 @@
is_float, is_dtype_equal,
is_object_dtype, is_string_dtype,
is_scalar, is_categorical_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.cast import (astype_nansafe,
maybe_convert_for_categorical)
from pandas.core.index import (Index, MultiIndex, RangeIndex,
_ensure_index_from_sequences)
from pandas.core.series import Series
@@ -1605,9 +1607,18 @@ def _cast_types(self, values, cast_type, column):
# XXX this is for consistency with
# c-parser which parses all categories
# as strings
if not is_object_dtype(values):
known_cats = (isinstance(cast_type, CategoricalDtype) and
cast_type.categories is not None)
Review comment (Contributor): none of this logic should live here either. Move to
pandas.core.dtypes.cast.py (also ok with a new module
pandas.core.dtypes.categorical.py if it's simpler).

Reply (Contributor Author): Refactored most of this to pandas.core.dtypes.cast

categories = ordered = None
Review comment (Contributor): why is this not using
Categorical._inferred_from_categories? This code duplication is just making
technical debt.

Reply (Contributor Author): I'm not sure how much cleaner 3de75cd is. These
really don't share much code, since the python parser has values, while the C
parser has categories and codes. And the python parser has to maybe cast values
to strings with cast_type='category'.

if known_cats:
values = maybe_convert_for_categorical(values, cast_type)
categories = cast_type.categories
ordered = cast_type.ordered
elif not is_object_dtype(values):
values = astype_nansafe(values, str)
values = Categorical(values)
values = Categorical(values, categories=categories,
ordered=ordered)
else:
try:
values = astype_nansafe(values, cast_type, copy=True)
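To exercise the Python-engine path changed above end to end, a hedged example (column names and data are illustrative):

```python
import pandas as pd
from pandas.compat import StringIO
from pandas.api.types import CategoricalDtype

data = 'col1,col2\na,1\nb,2\nc,3'
dtype = CategoricalDtype(['a', 'b', 'd'], ordered=True)  # note: no 'c'

# engine='python' goes through _cast_types; 'c' is outside dtype.categories,
# so it comes back as NaN, matching the C engine's behavior.
out = pd.read_csv(StringIO(data), dtype={'col1': dtype}, engine='python')
print(out.col1)
print(out.col1.cat.categories)
```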
39 changes: 39 additions & 0 deletions pandas/tests/dtypes/test_cast.py
@@ -16,6 +16,7 @@
from pandas.core.dtypes.cast import (
maybe_downcast_to_dtype,
maybe_convert_objects,
maybe_convert_for_categorical,
cast_scalar_to_array,
infer_dtype_from_scalar,
infer_dtype_from_array,
@@ -299,6 +300,44 @@ def test_maybe_infer_to_datetimelike(self):
[NaT, 'b', 1]]))
assert result.size == 6

def test_maybe_convert_for_categorical_noop(self):
expected = ['1', '2']
result = maybe_convert_for_categorical(expected, None)
assert result == expected

result = maybe_convert_for_categorical(expected, CategoricalDtype())
assert result == expected

result = maybe_convert_for_categorical(expected, 'category')
assert result == expected

@pytest.mark.parametrize('categories, dtype, expected', [
(['1', '2'], [1, 2, 3], np.array([1, 2], dtype='i8')),
(['1', '2', 'a'], [1, 2, 3], np.array([1, 2, np.nan], dtype='f8')),
])
def test_maybe_convert_for_categorical(self, categories, dtype, expected):
dtype = CategoricalDtype(dtype)
result = maybe_convert_for_categorical(categories, dtype)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize('categories, dtype, expected', [
(['2016', '2017'], pd.to_datetime(['2016', '2017']),
pd.to_datetime(['2016', '2017'])),
(['2016', '2017', 'bad'], pd.to_datetime(['2016', '2017']),
pd.to_datetime(['2016', '2017', 'NaT'])),

(['1H', '2H'], pd.to_timedelta(['1H', '2H']),
pd.to_timedelta(['1H', '2H'])),
(['1H', '2H', 'bad'], pd.to_timedelta(['1H', '2H']),
pd.to_timedelta(['1H', '2H', 'NaT'])),

])
def test_maybe_convert_for_categorical_dates(self, categories, dtype,
expected):
dtype = CategoricalDtype(dtype)
result = maybe_convert_for_categorical(categories, dtype)
tm.assert_index_equal(result, expected)


class TestConvert(object):
