ENH: Accept CategoricalDtype in read_csv (pandas-dev#17643)

TomAugspurger · web-flow · commit def3bce010eb · 2017-10-02T09:10:42.000-05:00
* ENH: Accept CategoricalDtype in CSV reader

* rework

* Fixed basic implementation

* Added casting

* Doc and cleanup

* Fixed assignment of categoricals

* Doc and test unexpected values

* DOC: fixups

* More coercion, use _recode_for_categories

* Refactor with maybe_convert_for_categorical

* PEP8

* Type for 32bit

* REF: refactor to new method

* py2 compat

* Refactored

* More in Categorical

* fixup! More in Categorical
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -452,7 +452,8 @@ Specifying Categorical dtype
 
 .. versionadded:: 0.19.0
 
-``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
+``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` or
+``dtype=CategoricalDtype(categories, ordered)``.
 
 .. ipython:: python
 
@@ -468,12 +469,40 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
+.. versionadded:: 0.21.0
+
+Specifying ``dtype='category'`` will result in an unordered ``Categorical``
+whose ``categories`` are the unique values observed in the data. For more
+control on the categories and order, create a
+:class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for
+that column's ``dtype``.
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
+   pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes
+
+When using ``dtype=CategoricalDtype``, "unexpected" values outside of
+``dtype.categories`` are treated as missing values.
+
+.. ipython:: python
+
+   dtype = CategoricalDtype(['a', 'b', 'd'])  # No 'c'
+   pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1
+
+This matches the behavior of :meth:`Categorical.set_categories`.
+
 .. note::
 
-   The resulting categories will always be parsed as strings (object dtype).
-   If the categories are numeric they can be converted using the
-   :func:`to_numeric` function, or as appropriate, another converter
-   such as :func:`to_datetime`.
+   With ``dtype='category'``, the resulting categories will always be parsed
+   as strings (object dtype). If the categories are numeric they can be
+   converted using the :func:`to_numeric` function, or as appropriate, another
+   converter such as :func:`to_datetime`.
+
+   When ``dtype`` is a ``CategoricalDtype`` with homogeneous ``categories``
+   (all numeric, all datetimes, etc.), the conversion is done automatically.
 
    .. ipython:: python
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -119,7 +119,7 @@ expanded to include the ``categories`` and ``ordered`` attributes. A
 ``CategoricalDtype`` can be used to specify the set of categories and
 orderedness of an array, independent of the data themselves. This can be useful,
 e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
-:issue:`15078`, :issue:`16015`):
+:issue:`15078`, :issue:`16015`, :issue:`17643`):
 
 .. ipython:: python
 
@@ -129,8 +129,37 @@ e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
    dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True)
    s.astype(dtype)
 
+One place that deserves special mention is in :func:`read_csv`. Previously, with
+``dtype={'col': 'category'}``, the returned values and categories would always
+be strings.
+
+.. ipython:: python
+   :suppress:
+
+   from pandas.compat import StringIO
+
+.. ipython:: python
+
+   data = 'A,B\na,1\nb,2\nc,3'
+   pd.read_csv(StringIO(data), dtype={'B': 'category'}).B.cat.categories
+
+Notice the "object" dtype.
+
+With a ``CategoricalDtype`` of all numerics, datetimes, or
+timedeltas, we can automatically convert to the correct type::
+
+    dtype = {'B': CategoricalDtype([1, 2, 3])}
+    pd.read_csv(StringIO(data), dtype=dtype).B.cat.categories
+
+The values have been correctly interpreted as integers.
+
 The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
-``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
+``Series`` with categorical type will now return an instance of
+``CategoricalDtype``. For the most part, this is backwards compatible, though
+the string repr has changed. If you were previously using ``str(s.dtype) ==
+'category'`` to detect categorical data, switch to
+:func:`pandas.api.types.is_categorical_dtype`, which is compatible with the old
+and new ``CategoricalDtype``.
 
 See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -45,7 +45,7 @@ from pandas.core.dtypes.common import (
     is_bool_dtype, is_object_dtype,
     is_string_dtype, is_datetime64_dtype,
     pandas_dtype)
-from pandas.core.categorical import Categorical
+from pandas.core.categorical import Categorical, _recode_for_categories
 from pandas.core.algorithms import take_1d
 from pandas.core.dtypes.concat import union_categoricals
 from pandas import Index
@@ -1267,19 +1267,14 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
         elif is_categorical_dtype(dtype):
+            # TODO: I suspect that _categorical_convert could be
+            # optimized when dtype is an instance of CategoricalDtype
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
-            # sort categories and recode if necessary
-            cats = Index(cats)
-            if not cats.is_monotonic_increasing:
-                unsorted = cats.copy()
-                cats = cats.sort_values()
-                indexer = cats.get_indexer(unsorted)
-                codes = take_1d(indexer, codes, fill_value=-1)
-
-            return Categorical(codes, categories=cats, ordered=False,
-                               fastpath=True), na_count
+            cat = Categorical._from_inferred_categories(cats, codes, dtype)
+            return cat, na_count
+
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
@@ -2230,8 +2225,11 @@ def _concatenate_chunks(list chunks):
             if common_type == np.object:
                 warning_columns.append(str(name))
 
-        if is_categorical_dtype(dtypes.pop()):
-            result[name] = union_categoricals(arrs, sort_categories=True)
+        dtype = dtypes.pop()
+        if is_categorical_dtype(dtype):
+            sort_categories = isinstance(dtype, str)
+            result[name] = union_categoricals(arrs,
+                                              sort_categories=sort_categories)
         else:
             result[name] = np.concatenate(arrs)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -21,6 +21,8 @@
     _ensure_platform_int,
     is_dtype_equal,
     is_datetimelike,
+    is_datetime64_dtype,
+    is_timedelta64_dtype,
     is_categorical,
     is_categorical_dtype,
     is_integer_dtype,
@@ -510,6 +512,59 @@ def base(self):
         """ compat, we are always our own object """
         return None
 
+    @classmethod
+    def _from_inferred_categories(cls, inferred_categories, inferred_codes,
+                                  dtype):
+        """Construct a Categorical from inferred values
+
+        For inferred categories (`dtype` is 'category') the categories are
+        sorted. For explicit `dtype`, the `inferred_categories` are cast to
+        the appropriate type.
+
+        Parameters
+        ----------
+
+        inferred_categories : Index
+        inferred_codes : array-like of int
+        dtype : CategoricalDtype or 'category'
+
+        Returns
+        -------
+        Categorical
+        """
+        from pandas import Index, to_numeric, to_datetime, to_timedelta
+
+        cats = Index(inferred_categories)
+
+        known_categories = (isinstance(dtype, CategoricalDtype) and
+                            dtype.categories is not None)
+
+        if known_categories:
+            # Convert to a specialized type with `dtype` if specified
+            if dtype.categories.is_numeric():
+                cats = to_numeric(inferred_categories, errors='coerce')
+            elif is_datetime64_dtype(dtype.categories):
+                cats = to_datetime(inferred_categories, errors='coerce')
+            elif is_timedelta64_dtype(dtype.categories):
+                cats = to_timedelta(inferred_categories, errors='coerce')
+
+        if known_categories:
+            # recode from observation order to dtype.categories order
+            categories = dtype.categories
+            codes = _recode_for_categories(inferred_codes, cats, categories)
+        elif not cats.is_monotonic_increasing:
+            # sort categories and recode for unknown categories
+            unsorted = cats.copy()
+            categories = cats.sort_values()
+            codes = _recode_for_categories(inferred_codes, unsorted,
+                                           categories)
+            dtype = CategoricalDtype(categories, ordered=False)
+        else:
+            dtype = CategoricalDtype(cats, ordered=False)
+            codes = inferred_codes
+
+        return cls(codes, dtype=dtype, fastpath=True)
+
     @classmethod
     def from_array(cls, data, **kwargs):
         """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -21,6 +21,7 @@
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
     is_scalar, is_categorical_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.index import (Index, MultiIndex, RangeIndex,
@@ -1602,12 +1603,20 @@ def _cast_types(self, values, cast_type, column):
         """
 
         if is_categorical_dtype(cast_type):
-            # XXX this is for consistency with
-            # c-parser which parses all categories
-            # as strings
-            if not is_object_dtype(values):
+            known_cats = (isinstance(cast_type, CategoricalDtype) and
+                          cast_type.categories is not None)
+
+            if not is_object_dtype(values) and not known_cats:
+                # XXX this is for consistency with
+                # c-parser which parses all categories
+                # as strings
                 values = astype_nansafe(values, str)
-            values = Categorical(values)
+
+            cats = Index(values).unique().dropna()
+            values = Categorical._from_inferred_categories(
+                cats, cats.get_indexer(values), cast_type
+            )
+
         else:
             try:
                 values = astype_nansafe(values, cast_type, copy=True)
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -149,6 +149,105 @@ def test_categorical_dtype_chunksize(self):
         for actual, expected in zip(actuals, expecteds):
             tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.parametrize('ordered', [False, True])
+    @pytest.mark.parametrize('categories', [
+        ['a', 'b', 'c'],
+        ['a', 'c', 'b'],
+        ['a', 'b', 'c', 'd'],
+        ['c', 'b', 'a'],
+    ])
+    def test_categorical_categoricaldtype(self, categories, ordered):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expected = pd.DataFrame({
+            "a": [1, 1, 1, 2],
+            "b": Categorical(['a', 'b', 'b', 'c'],
+                             categories=categories,
+                             ordered=ordered)
+        })
+        dtype = {"b": CategoricalDtype(categories=categories,
+                                       ordered=ordered)}
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_categoricaldtype_unsorted(self):
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        dtype = CategoricalDtype(['c', 'b', 'a'])
+        expected = pd.DataFrame({
+            'a': [1, 1, 1, 2],
+            'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a'])
+        })
+        result = self.read_csv(StringIO(data), dtype={'b': dtype})
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_numeric(self):
+        dtype = {'b': CategoricalDtype([1, 2, 3])}
+        data = "b\n1\n1\n2\n3"
+        expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_datetime(self):
+        dtype = {
+            'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
+        }
+        data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
+        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        dtype = {
+            'b': CategoricalDtype([pd.Timestamp("2014")])
+        }
+        data = "b\n2014-01-01\n2014-01-01T00:00:00"
+        expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_coerces_timedelta(self):
+        dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))}
+        data = "b\n1H\n2H\n3H"
+        expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categoricaldtype_unexpected_categories(self):
+        dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])}
+        data = "b\nd\na\nc\nd"  # Unexpected c
+        expected = pd.DataFrame({"b": Categorical(list('dacd'),
+                                                  dtype=dtype['b'])})
+        result = self.read_csv(StringIO(data), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_categoricaldtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        cats = ['a', 'b', 'c']
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'],
+                                                    categories=cats)}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'],
+                                                    categories=cats)},
+                                  index=[2, 3])]
+        dtype = CategoricalDtype(cats)
+        actuals = self.read_csv(StringIO(data), dtype={'b': dtype},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_empty_pass_dtype(self):
         data = 'one,two'
         result = self.read_csv(StringIO(data), dtype={'one': 'u1'})
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py