Skip to content

Commit 3f967dc

Browse files
TomAugspurger authored and Pingviinituutti committed
BUG: Properly handle CSV boolean CategoricalDtype (pandas-dev#20826)
Previously, a CSV column read with a boolean CategoricalDtype was being parsed as object instead of boolean. Closes pandas-dev#20498. Original Author: @TomAugspurger. Rebased by @gfyoung due to merge conflicts.
1 parent 02bd3e2 commit 3f967dc

File tree

5 files changed

+52
-22
lines changed

5 files changed

+52
-22
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1375,6 +1375,7 @@ Current Behavior:
13751375
13761376
Notice how we now output ``np.nan`` itself instead of a stringified form of it.
13771377

1378+
- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`)
13781379
- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
13791380
- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
13801381
- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)

pandas/_libs/parsers.pyx

+15-10
Original file line numberDiff line numberDiff line change
@@ -1202,7 +1202,20 @@ cdef class TextReader:
12021202
bint user_dtype,
12031203
kh_str_t *na_hashset,
12041204
object na_flist):
1205-
if is_integer_dtype(dtype):
1205+
if is_categorical_dtype(dtype):
1206+
# TODO: I suspect that _categorical_convert could be
1207+
# optimized when dtype is an instance of CategoricalDtype
1208+
codes, cats, na_count = _categorical_convert(
1209+
self.parser, i, start, end, na_filter,
1210+
na_hashset, self.c_encoding)
1211+
1212+
# Method accepts list of strings, not encoded ones.
1213+
true_values = [x.decode() for x in self.true_values]
1214+
cat = Categorical._from_inferred_categories(
1215+
cats, codes, dtype, true_values=true_values)
1216+
return cat, na_count
1217+
1218+
elif is_integer_dtype(dtype):
12061219
try:
12071220
result, na_count = _try_int64(self.parser, i, start,
12081221
end, na_filter, na_hashset)
@@ -1233,6 +1246,7 @@ cdef class TextReader:
12331246
na_filter, na_hashset,
12341247
self.true_set, self.false_set)
12351248
return result, na_count
1249+
12361250
elif dtype.kind == 'S':
12371251
# TODO: na handling
12381252
width = dtype.itemsize
@@ -1252,15 +1266,6 @@ cdef class TextReader:
12521266
# unicode variable width
12531267
return self._string_convert(i, start, end, na_filter,
12541268
na_hashset)
1255-
elif is_categorical_dtype(dtype):
1256-
# TODO: I suspect that _categorical_convert could be
1257-
# optimized when dtype is an instance of CategoricalDtype
1258-
codes, cats, na_count = _categorical_convert(
1259-
self.parser, i, start, end, na_filter,
1260-
na_hashset, self.c_encoding)
1261-
cat = Categorical._from_inferred_categories(cats, codes, dtype)
1262-
return cat, na_count
1263-
12641269
elif is_object_dtype(dtype):
12651270
return self._string_convert(i, start, end, na_filter,
12661271
na_hashset)

pandas/core/arrays/categorical.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -546,19 +546,22 @@ def base(self):
546546

547547
@classmethod
548548
def _from_inferred_categories(cls, inferred_categories, inferred_codes,
549-
dtype):
550-
"""Construct a Categorical from inferred values
549+
dtype, true_values=None):
550+
"""
551+
Construct a Categorical from inferred values.
551552
552553
For inferred categories (`dtype` is None) the categories are sorted.
553554
For explicit `dtype`, the `inferred_categories` are cast to the
554555
appropriate type.
555556
556557
Parameters
557558
----------
558-
559559
inferred_categories : Index
560560
inferred_codes : Index
561561
dtype : CategoricalDtype or 'category'
562+
true_values : list, optional
563+
If none are provided, the default ones are
564+
"True", "TRUE", and "true".
562565
563566
Returns
564567
-------
@@ -567,27 +570,32 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
567570
from pandas import Index, to_numeric, to_datetime, to_timedelta
568571

569572
cats = Index(inferred_categories)
570-
571573
known_categories = (isinstance(dtype, CategoricalDtype) and
572574
dtype.categories is not None)
573575

574576
if known_categories:
575-
# Convert to a specialzed type with `dtype` if specified
577+
# Convert to a specialized type with `dtype` if specified.
576578
if dtype.categories.is_numeric():
577-
cats = to_numeric(inferred_categories, errors='coerce')
579+
cats = to_numeric(inferred_categories, errors="coerce")
578580
elif is_datetime64_dtype(dtype.categories):
579-
cats = to_datetime(inferred_categories, errors='coerce')
581+
cats = to_datetime(inferred_categories, errors="coerce")
580582
elif is_timedelta64_dtype(dtype.categories):
581-
cats = to_timedelta(inferred_categories, errors='coerce')
583+
cats = to_timedelta(inferred_categories, errors="coerce")
584+
elif dtype.categories.is_boolean():
585+
if true_values is None:
586+
true_values = ["True", "TRUE", "true"]
587+
588+
cats = cats.isin(true_values)
582589

583590
if known_categories:
584-
# recode from observation order to dtype.categories order
591+
# Recode from observation order to dtype.categories order.
585592
categories = dtype.categories
586593
codes = _recode_for_categories(inferred_codes, cats, categories)
587594
elif not cats.is_monotonic_increasing:
588-
# sort categories and recode for unknown categories
595+
# Sort categories and recode for unknown categories.
589596
unsorted = cats.copy()
590597
categories = cats.sort_values()
598+
591599
codes = _recode_for_categories(inferred_codes, unsorted,
592600
categories)
593601
dtype = CategoricalDtype(categories, ordered=False)

pandas/io/parsers.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1752,8 +1752,8 @@ def _cast_types(self, values, cast_type, column):
17521752

17531753
cats = Index(values).unique().dropna()
17541754
values = Categorical._from_inferred_categories(
1755-
cats, cats.get_indexer(values), cast_type
1756-
)
1755+
cats, cats.get_indexer(values), cast_type,
1756+
true_values=self.true_values)
17571757

17581758
else:
17591759
try:

pandas/tests/io/parser/test_dtypes.py

+16
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,22 @@ def test_categorical_coerces_timedelta(all_parsers):
324324
tm.assert_frame_equal(result, expected)
325325

326326

327+
@pytest.mark.parametrize("data", [
328+
"b\nTrue\nFalse\nNA\nFalse",
329+
"b\ntrue\nfalse\nNA\nfalse",
330+
"b\nTRUE\nFALSE\nNA\nFALSE",
331+
"b\nTrue\nFalse\nNA\nFALSE",
332+
])
333+
def test_categorical_dtype_coerces_boolean(all_parsers, data):
334+
# see gh-20498
335+
parser = all_parsers
336+
dtype = {"b": CategoricalDtype([False, True])}
337+
expected = DataFrame({"b": Categorical([True, False, None, False])})
338+
339+
result = parser.read_csv(StringIO(data), dtype=dtype)
340+
tm.assert_frame_equal(result, expected)
341+
342+
327343
def test_categorical_unexpected_categories(all_parsers):
328344
parser = all_parsers
329345
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

0 commit comments

Comments
 (0)