diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4ff3cc728f7f7..9b32034defda0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1369,6 +1369,7 @@ Current Behavior: Notice how we now instead output ``np.nan`` itself instead of a stringified form of it. +- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) - Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`) - Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`) - Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index f74de79542628..1dc71264c94dd 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1202,7 +1202,20 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if is_integer_dtype(dtype): + if is_categorical_dtype(dtype): + # TODO: I suspect that _categorical_convert could be + # optimized when dtype is an instance of CategoricalDtype + codes, cats, na_count = _categorical_convert( + self.parser, i, start, end, na_filter, + na_hashset, self.c_encoding) + + # Method accepts list of strings, not encoded ones. 
+ true_values = [x.decode() for x in self.true_values] + cat = Categorical._from_inferred_categories( + cats, codes, dtype, true_values=true_values) + return cat, na_count + + elif is_integer_dtype(dtype): try: result, na_count = _try_int64(self.parser, i, start, end, na_filter, na_hashset) @@ -1233,6 +1246,7 @@ cdef class TextReader: na_filter, na_hashset, self.true_set, self.false_set) return result, na_count + elif dtype.kind == 'S': # TODO: na handling width = dtype.itemsize @@ -1252,15 +1266,6 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - elif is_categorical_dtype(dtype): - # TODO: I suspect that _categorical_convert could be - # optimized when dtype is an instance of CategoricalDtype - codes, cats, na_count = _categorical_convert( - self.parser, i, start, end, na_filter, - na_hashset, self.c_encoding) - cat = Categorical._from_inferred_categories(cats, codes, dtype) - return cat, na_count - elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6dc3a960dc817..59ad386f797dc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -546,8 +546,9 @@ def base(self): @classmethod def _from_inferred_categories(cls, inferred_categories, inferred_codes, - dtype): - """Construct a Categorical from inferred values + dtype, true_values=None): + """ + Construct a Categorical from inferred values. For inferred categories (`dtype` is None) the categories are sorted. For explicit `dtype`, the `inferred_categories` are cast to the @@ -555,10 +556,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, Parameters ---------- - inferred_categories : Index inferred_codes : Index dtype : CategoricalDtype or 'category' + true_values : list, optional + If none are provided, the default ones are + "True", "TRUE", and "true".
Returns ------- @@ -567,27 +570,32 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, from pandas import Index, to_numeric, to_datetime, to_timedelta cats = Index(inferred_categories) - known_categories = (isinstance(dtype, CategoricalDtype) and dtype.categories is not None) if known_categories: - # Convert to a specialzed type with `dtype` if specified + # Convert to a specialized type with `dtype` if specified. if dtype.categories.is_numeric(): - cats = to_numeric(inferred_categories, errors='coerce') + cats = to_numeric(inferred_categories, errors="coerce") elif is_datetime64_dtype(dtype.categories): - cats = to_datetime(inferred_categories, errors='coerce') + cats = to_datetime(inferred_categories, errors="coerce") elif is_timedelta64_dtype(dtype.categories): - cats = to_timedelta(inferred_categories, errors='coerce') + cats = to_timedelta(inferred_categories, errors="coerce") + elif dtype.categories.is_boolean(): + if true_values is None: + true_values = ["True", "TRUE", "true"] + + cats = cats.isin(true_values) if known_categories: - # recode from observation order to dtype.categories order + # Recode from observation order to dtype.categories order. categories = dtype.categories codes = _recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: - # sort categories and recode for unknown categories + # Sort categories and recode for unknown categories. 
unsorted = cats.copy() categories = cats.sort_values() + codes = _recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index acb9bca2545c0..990709e20f1e0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1745,8 +1745,8 @@ def _cast_types(self, values, cast_type, column): cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type - ) + cats, cats.get_indexer(values), cast_type, + true_values=self.true_values) else: try: diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 17cd0ab16ea61..caa03fc3685f6 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -324,6 +324,22 @@ def test_categorical_coerces_timedelta(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("data", [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", +]) +def test_categorical_dtype_coerces_boolean(all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_categorical_unexpected_categories(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}