BUG: Fixed read_csv with boolean CategoricalDtype

TomAugspurger · TomAugspurger · commit 5495551e1797 · 2018-04-25T16:57:17.000-05:00
Closes #20498
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1155,6 +1155,7 @@ I/O
 - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
 - Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
+- Bug in :func:`read_csv` where string values were not correctly coerced to booleans when using a ``CategoricalDtype`` with boolean categories (:issue:`20498`)
 - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
 - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
 - Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1233,7 +1233,8 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
-            cat = Categorical._from_inferred_categories(cats, codes, dtype)
+            cat = Categorical._from_inferred_categories(cats, codes, dtype,
+                                                        true_values=self.true_values)
             return cat, na_count
 
         elif is_object_dtype(dtype):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -490,7 +490,7 @@ def base(self):
 
     @classmethod
     def _from_inferred_categories(cls, inferred_categories, inferred_codes,
-                                  dtype):
+                                  dtype, true_values):
         """Construct a Categorical from inferred values
 
         For inferred categories (`dtype` is None) the categories are sorted.
@@ -503,6 +503,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
         inferred_categories : Index
         inferred_codes : Index
         dtype : CategoricalDtype or 'category'
+        true_values : list of bytes or None
 
         Returns
         -------
@@ -524,7 +525,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
             elif is_timedelta64_dtype(dtype.categories):
                 cats = to_timedelta(inferred_categories, errors='coerce')
             elif dtype.categories.is_boolean():
-                cats = inferred_categories == 'True'
+                # from _true_values in parsers.pyx
+                # TODO: share declaration with parsers.pyx
+                if true_values is None:
+                    true_values = [b'True', b'true', b'TRUE']
+                true_values = [x.decode() for x in true_values]
+                cats = cats.isin(true_values)
 
         if known_categories:
             # recode from observation order to dtype.categories order
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1654,7 +1654,8 @@ def _cast_types(self, values, cast_type, column):
 
             cats = Index(values).unique().dropna()
             values = Categorical._from_inferred_categories(
-                cats, cats.get_indexer(values), cast_type
+                cats, cats.get_indexer(values), cast_type,
+                self.true_values
             )
 
         else:
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -229,10 +229,15 @@ def test_categoricaldtype_coerces_timedelta(self):
         result = self.read_csv(StringIO(data), dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
-    def test_categoricaldtype_coerces_boolen(self):
+    @pytest.mark.parametrize('data', [
+        'b\nTrue\nFalse\nNA\nFalse',
+        'b\ntrue\nfalse\nNA\nfalse',
+        'b\nTRUE\nFALSE\nNA\nFALSE',
+        'b\nTrue\nFalse\nNA\nFALSE',
+    ])
+    def test_categoricaldtype_coerces_boolen(self, data):
         # 20498
         dtype = {'b': CategoricalDtype([False, True])}
-        data = "b\nTrue\nFalse\nNA\nFalse"
         expected = pd.DataFrame({"b": Categorical([True, False, None, False])})
         result = self.read_csv(StringIO(data), dtype=dtype)
         tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -1654,7 +1654,8 @@ def _cast_types(self, values, cast_type, column):`
`1654`	`1654`
`1655`	`1655`	`cats = Index(values).unique().dropna()`
`1656`	`1656`	`values = Categorical._from_inferred_categories(`
`1657`		`- cats, cats.get_indexer(values), cast_type`
	`1657`	`+ cats, cats.get_indexer(values), cast_type,`
	`1658`	`+ self.true_values`
`1658`	`1659`	`)`
`1659`	`1660`
`1660`	`1661`	`else:`