ENH: Accept CategoricalDtype in CSV reader

TomAugspurger · TomAugspurger · commit ccbaa04f1745 · 2017-09-23T06:28:25.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -452,7 +452,8 @@ Specifying Categorical dtype
 
 .. versionadded:: 0.19.0
 
-``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
+``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
+``dtype=CategoricalDtype(categories, ordered)``.
 
 .. ipython:: python
 
@@ -468,6 +469,18 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
+Specifying ``dtype='category'`` will result in a ``Categorical`` that is
+unordered, and whose ``categories`` are the unique values observed in the data.
+For more control over the categories and order, create a
+:class:`~pandas.api.types.CategoricalDtype` ahead of time.
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
+   pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes
+
 .. note::
 
    The resulting categories will always be parsed as strings (object dtype).
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1267,6 +1267,8 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
         elif is_categorical_dtype(dtype):
+            # TODO: I suspect that this could be optimized when dtype
+            # is an instance of CategoricalDtype
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
@@ -1278,8 +1280,18 @@ cdef class TextReader:
                 indexer = cats.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
 
-            return Categorical(codes, categories=cats, ordered=False,
-                               fastpath=True), na_count
+            cat = Categorical(codes, categories=cats, ordered=False,
+                              fastpath=True)
+
+            if isinstance(dtype, CategoricalDtype):
+                if dtype.categories is None:
+                    # skip recoding
+                    if dtype.ordered:
+                        cat = cat.set_ordered(ordered=dtype.ordered)
+                else:
+                    cat = cat.set_categories(dtype.categories,
+                                             ordered=dtype.ordered)
+            return cat, na_count
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
@@ -2230,8 +2242,11 @@ def _concatenate_chunks(list chunks):
             if common_type == np.object:
                 warning_columns.append(str(name))
 
-        if is_categorical_dtype(dtypes.pop()):
-            result[name] = union_categoricals(arrs, sort_categories=True)
+        dtype = dtypes.pop()
+        if is_categorical_dtype(dtype):
+            sort_categories = isinstance(dtype, str)
+            result[name] = union_categoricals(arrs,
+                                              sort_categories=sort_categories)
         else:
             result[name] = np.concatenate(arrs)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -21,6 +21,7 @@
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
     is_scalar, is_categorical_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
@@ -1578,7 +1579,11 @@ def _cast_types(self, values, cast_type, column):
             # as strings
             if not is_object_dtype(values):
                 values = astype_nansafe(values, str)
-            values = Categorical(values)
+            if isinstance(cast_type, CategoricalDtype):
+                values = Categorical(values, categories=cast_type.categories,
+                                     ordered=cast_type.ordered)
+            else:
+                values = Categorical(values)
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -149,6 +149,51 @@ def test_categorical_dtype_chunksize(self):
         for actual, expected in zip(actuals, expecteds):
             tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.parametrize('ordered', [False, True])
+    @pytest.mark.parametrize('categories', [
+        ['a', 'b', 'c'],
+        ['a', 'c', 'b'],
+        ['a', 'b', 'c', 'd'],
+    ])
+    def test_categorical_categoricaldtype(self, categories, ordered):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expected = pd.DataFrame({
+            "a": [1, 1, 1, 2],
+            "b": Categorical(['a', 'b', 'b', 'c'],
+                             categories=categories,
+                             ordered=ordered)
+        })
+        dtype = {"b": CategoricalDtype(categories=categories,
+                                       ordered=ordered)}
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_categoricaldtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        cats = ['a', 'b', 'c']
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'],
+                                                    categories=cats)}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'],
+                                                    categories=cats)},
+                                  index=[2, 3])]
+        dtype = CategoricalDtype(cats)
+        actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_empty_pass_dtype(self):
         data = 'one,two'
         result = self.read_csv(StringIO(data), dtype={'one': 'u1'})