BUG: Properly handle CSV boolean CategoricalDtype (pandas-dev#20826)

TomAugspurger · Pingviinituutti · commit 3f967dcd59a0 · 2019-02-28T10:19:12.000+02:00
Previously, was being parsed as object instead of boolean. Closes pandas-devgh-20498. Original Author: @TomAugspurger Rebased by @gfyoung due to merge conflicts.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1375,6 +1375,7 @@ Current Behavior:
 
 Notice how we now instead output ``np.nan`` itself instead of a stringified form of it.
 
+- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`)
 - Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
 - Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
 - Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1202,7 +1202,20 @@ cdef class TextReader:
                              bint user_dtype,
                              kh_str_t *na_hashset,
                              object na_flist):
-        if is_integer_dtype(dtype):
+        if is_categorical_dtype(dtype):
+            # TODO: I suspect that _categorical_convert could be
+            # optimized when dtype is an instance of CategoricalDtype
+            codes, cats, na_count = _categorical_convert(
+                self.parser, i, start, end, na_filter,
+                na_hashset, self.c_encoding)
+
+            # Method accepts list of strings, not encoded ones.
+            true_values = [x.decode() for x in self.true_values]
+            cat = Categorical._from_inferred_categories(
+                cats, codes, dtype, true_values=true_values)
+            return cat, na_count
+
+        elif is_integer_dtype(dtype):
             try:
                 result, na_count = _try_int64(self.parser, i, start,
                                               end, na_filter, na_hashset)
@@ -1233,6 +1246,7 @@ cdef class TextReader:
                                               na_filter, na_hashset,
                                               self.true_set, self.false_set)
             return result, na_count
+
         elif dtype.kind == 'S':
             # TODO: na handling
             width = dtype.itemsize
@@ -1252,15 +1266,6 @@ cdef class TextReader:
             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
-        elif is_categorical_dtype(dtype):
-            # TODO: I suspect that _categorical_convert could be
-            # optimized when dtype is an instance of CategoricalDtype
-            codes, cats, na_count = _categorical_convert(
-                self.parser, i, start, end, na_filter,
-                na_hashset, self.c_encoding)
-            cat = Categorical._from_inferred_categories(cats, codes, dtype)
-            return cat, na_count
-
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -546,19 +546,22 @@ def base(self):
 
     @classmethod
     def _from_inferred_categories(cls, inferred_categories, inferred_codes,
-                                  dtype):
-        """Construct a Categorical from inferred values
+                                  dtype, true_values=None):
+        """
+        Construct a Categorical from inferred values.
 
         For inferred categories (`dtype` is None) the categories are sorted.
         For explicit `dtype`, the `inferred_categories` are cast to the
         appropriate type.
 
         Parameters
         ----------
-
         inferred_categories : Index
         inferred_codes : Index
         dtype : CategoricalDtype or 'category'
+        true_values : list, optional
+            If none are provided, the default ones are
+            "True", "TRUE", and "true."
 
         Returns
         -------
@@ -567,27 +570,32 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
         from pandas import Index, to_numeric, to_datetime, to_timedelta
 
         cats = Index(inferred_categories)
-
         known_categories = (isinstance(dtype, CategoricalDtype) and
                             dtype.categories is not None)
 
         if known_categories:
-            # Convert to a specialzed type with `dtype` if specified
+            # Convert to a specialized type with `dtype` if specified.
             if dtype.categories.is_numeric():
-                cats = to_numeric(inferred_categories, errors='coerce')
+                cats = to_numeric(inferred_categories, errors="coerce")
             elif is_datetime64_dtype(dtype.categories):
-                cats = to_datetime(inferred_categories, errors='coerce')
+                cats = to_datetime(inferred_categories, errors="coerce")
             elif is_timedelta64_dtype(dtype.categories):
-                cats = to_timedelta(inferred_categories, errors='coerce')
+                cats = to_timedelta(inferred_categories, errors="coerce")
+            elif dtype.categories.is_boolean():
+                if true_values is None:
+                    true_values = ["True", "TRUE", "true"]
+
+                cats = cats.isin(true_values)
 
         if known_categories:
-            # recode from observation order to dtype.categories order
+            # Recode from observation order to dtype.categories order.
             categories = dtype.categories
             codes = _recode_for_categories(inferred_codes, cats, categories)
         elif not cats.is_monotonic_increasing:
-            # sort categories and recode for unknown categories
+            # Sort categories and recode for unknown categories.
             unsorted = cats.copy()
             categories = cats.sort_values()
+
             codes = _recode_for_categories(inferred_codes, unsorted,
                                            categories)
             dtype = CategoricalDtype(categories, ordered=False)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1752,8 +1752,8 @@ def _cast_types(self, values, cast_type, column):
 
             cats = Index(values).unique().dropna()
             values = Categorical._from_inferred_categories(
-                cats, cats.get_indexer(values), cast_type
-            )
+                cats, cats.get_indexer(values), cast_type,
+                true_values=self.true_values)
 
         else:
             try:
diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
@@ -324,6 +324,22 @@ def test_categorical_coerces_timedelta(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("data", [
+    "b\nTrue\nFalse\nNA\nFalse",
+    "b\ntrue\nfalse\nNA\nfalse",
+    "b\nTRUE\nFALSE\nNA\nFALSE",
+    "b\nTrue\nFalse\nNA\nFALSE",
+])
+def test_categorical_dtype_coerces_boolean(all_parsers, data):
+    # see gh-20498
+    parser = all_parsers
+    dtype = {"b": CategoricalDtype([False, True])}
+    expected = DataFrame({"b": Categorical([True, False, None, False])})
+
+    result = parser.read_csv(StringIO(data), dtype=dtype)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_categorical_unexpected_categories(all_parsers):
     parser = all_parsers
     dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}