Skip to content

Fixed read_csv with CategoricalDtype with boolean categories (20498) #20826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1369,6 +1369,7 @@ Current Behavior:

Notice how we now output ``np.nan`` itself instead of a stringified form of it.

- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`)
- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)
Expand Down
25 changes: 15 additions & 10 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1202,7 +1202,20 @@ cdef class TextReader:
bint user_dtype,
kh_str_t *na_hashset,
object na_flist):
if is_integer_dtype(dtype):
if is_categorical_dtype(dtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
codes, cats, na_count = _categorical_convert(
self.parser, i, start, end, na_filter,
na_hashset, self.c_encoding)

# Method accepts list of strings, not encoded ones.
true_values = [x.decode() for x in self.true_values]
cat = Categorical._from_inferred_categories(
cats, codes, dtype, true_values=true_values)
return cat, na_count

elif is_integer_dtype(dtype):
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
Expand Down Expand Up @@ -1233,6 +1246,7 @@ cdef class TextReader:
na_filter, na_hashset,
self.true_set, self.false_set)
return result, na_count

elif dtype.kind == 'S':
# TODO: na handling
width = dtype.itemsize
Expand All @@ -1252,15 +1266,6 @@ cdef class TextReader:
# unicode variable width
return self._string_convert(i, start, end, na_filter,
na_hashset)
elif is_categorical_dtype(dtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
codes, cats, na_count = _categorical_convert(
self.parser, i, start, end, na_filter,
na_hashset, self.c_encoding)
cat = Categorical._from_inferred_categories(cats, codes, dtype)
return cat, na_count

elif is_object_dtype(dtype):
return self._string_convert(i, start, end, na_filter,
na_hashset)
Expand Down
28 changes: 18 additions & 10 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,19 +546,22 @@ def base(self):

@classmethod
def _from_inferred_categories(cls, inferred_categories, inferred_codes,
dtype):
"""Construct a Categorical from inferred values
dtype, true_values=None):
"""
Construct a Categorical from inferred values.

For inferred categories (`dtype` is None) the categories are sorted.
For explicit `dtype`, the `inferred_categories` are cast to the
appropriate type.

Parameters
----------

inferred_categories : Index
inferred_codes : Index
dtype : CategoricalDtype or 'category'
true_values : list, optional
If none are provided, the default ones are
"True", "TRUE", and "true".

Returns
-------
Expand All @@ -567,27 +570,32 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
from pandas import Index, to_numeric, to_datetime, to_timedelta

cats = Index(inferred_categories)

known_categories = (isinstance(dtype, CategoricalDtype) and
dtype.categories is not None)

if known_categories:
# Convert to a specialzed type with `dtype` if specified
# Convert to a specialized type with `dtype` if specified.
if dtype.categories.is_numeric():
cats = to_numeric(inferred_categories, errors='coerce')
cats = to_numeric(inferred_categories, errors="coerce")
elif is_datetime64_dtype(dtype.categories):
cats = to_datetime(inferred_categories, errors='coerce')
cats = to_datetime(inferred_categories, errors="coerce")
elif is_timedelta64_dtype(dtype.categories):
cats = to_timedelta(inferred_categories, errors='coerce')
cats = to_timedelta(inferred_categories, errors="coerce")
elif dtype.categories.is_boolean():
if true_values is None:
true_values = ["True", "TRUE", "true"]

cats = cats.isin(true_values)

if known_categories:
# recode from observation order to dtype.categories order
# Recode from observation order to dtype.categories order.
categories = dtype.categories
codes = _recode_for_categories(inferred_codes, cats, categories)
elif not cats.is_monotonic_increasing:
# sort categories and recode for unknown categories
# Sort categories and recode for unknown categories.
unsorted = cats.copy()
categories = cats.sort_values()

codes = _recode_for_categories(inferred_codes, unsorted,
categories)
dtype = CategoricalDtype(categories, ordered=False)
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1745,8 +1745,8 @@ def _cast_types(self, values, cast_type, column):

cats = Index(values).unique().dropna()
values = Categorical._from_inferred_categories(
cats, cats.get_indexer(values), cast_type
)
cats, cats.get_indexer(values), cast_type,
true_values=self.true_values)

else:
try:
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/io/parser/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,22 @@ def test_categorical_coerces_timedelta(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    """
    Check read_csv output for a column typed as a boolean CategoricalDtype.

    Each parametrized CSV payload spells the boolean values with different
    casing (including a mixed-case payload) and contains one ``NA`` row.
    All of them must compare equal to ``Categorical([True, False, None,
    False])``.

    See gh-20498.
    """
    bool_categories = CategoricalDtype([False, True])
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = all_parsers.read_csv(
        StringIO(data), dtype={"b": bool_categories}
    )
    tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
Expand Down