Skip to content

Commit 5495551

Browse files
committed
BUG: Fixed read_csv with boolean CategoricalDtype
Closes #20498
1 parent def0ee4 commit 5495551

File tree

5 files changed

+20
-6
lines changed

5 files changed

+20
-6
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,7 @@ I/O
11551155
- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
11561156
- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
11571157
- Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
1158+
- Bug in :func:`read_csv` where a ``CategoricalDtype`` with boolean categories did not correctly coerce string values to booleans (:issue:`20498`)
11581159
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
11591160
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
11601161
- Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`)

pandas/_libs/parsers.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,8 @@ cdef class TextReader:
12331233
codes, cats, na_count = _categorical_convert(
12341234
self.parser, i, start, end, na_filter,
12351235
na_hashset, self.c_encoding)
1236-
cat = Categorical._from_inferred_categories(cats, codes, dtype)
1236+
cat = Categorical._from_inferred_categories(cats, codes, dtype,
1237+
true_values=self.true_values)
12371238
return cat, na_count
12381239

12391240
elif is_object_dtype(dtype):

pandas/core/arrays/categorical.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ def base(self):
490490

491491
@classmethod
492492
def _from_inferred_categories(cls, inferred_categories, inferred_codes,
493-
dtype):
493+
dtype, true_values):
494494
"""Construct a Categorical from inferred values
495495
496496
For inferred categories (`dtype` is None) the categories are sorted.
@@ -503,6 +503,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
503503
inferred_categories : Index
504504
inferred_codes : Index
505505
dtype : CategoricalDtype or 'category'
506+
true_values : list of bytes
506507
507508
Returns
508509
-------
@@ -524,7 +525,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
524525
elif is_timedelta64_dtype(dtype.categories):
525526
cats = to_timedelta(inferred_categories, errors='coerce')
526527
elif dtype.categories.is_boolean():
527-
cats = inferred_categories == 'True'
528+
# from _true_values in parsers.pyx
529+
# TODO: share declaration with parsers.pyx
530+
if true_values is None:
531+
true_values = [b'True', b'true', b'TRUE']
532+
true_values = [x.decode() for x in true_values]
533+
cats = cats.isin(true_values)
528534

529535
if known_categories:
530536
# recode from observation order to dtype.categories order

pandas/io/parsers.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1654,7 +1654,8 @@ def _cast_types(self, values, cast_type, column):
16541654

16551655
cats = Index(values).unique().dropna()
16561656
values = Categorical._from_inferred_categories(
1657-
cats, cats.get_indexer(values), cast_type
1657+
cats, cats.get_indexer(values), cast_type,
1658+
self.true_values
16581659
)
16591660

16601661
else:

pandas/tests/io/parser/dtypes.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,15 @@ def test_categoricaldtype_coerces_timedelta(self):
229229
result = self.read_csv(StringIO(data), dtype=dtype)
230230
tm.assert_frame_equal(result, expected)
231231

232-
def test_categoricaldtype_coerces_boolen(self):
232+
@pytest.mark.parametrize('data', [
233+
'b\nTrue\nFalse\nNA\nFalse',
234+
'b\ntrue\nfalse\nNA\nfalse',
235+
'b\nTRUE\nFALSE\nNA\nFALSE',
236+
'b\nTrue\nFalse\nNA\nFALSE',
237+
])
238+
def test_categoricaldtype_coerces_boolen(self, data):
233239
# 20498
234240
dtype = {'b': CategoricalDtype([False, True])}
235-
data = "b\nTrue\nFalse\nNA\nFalse"
236241
expected = pd.DataFrame({"b": Categorical([True, False, None, False])})
237242
result = self.read_csv(StringIO(data), dtype=dtype)
238243
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)