Skip to content

Commit 97e5ca4

Browse files
committed
Move to parsers
1 parent 1114a86 commit 97e5ca4

File tree

6 files changed

+75
-72
lines changed

6 files changed

+75
-72
lines changed

doc/source/whatsnew/v0.23.0.txt

-1
Original file line numberDiff line numberDiff line change
@@ -1316,7 +1316,6 @@ I/O
13161316
- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
13171317
- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
13181318
- Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
1319-
- Bug in :func:`read_csv` with a ``CategoricalDtype`` with boolean categories not correctly coercing the string values to booleans (:issue:`20498`)
13201319
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
13211320
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
13221321
- Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`)

doc/source/whatsnew/v0.23.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ Bug Fixes
106106

107107
- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`)
108108
- Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`)
109+
- Bug in :func:`read_csv` with a ``CategoricalDtype`` with boolean categories not correctly coercing the string values to booleans (:issue:`20498`)
109110
-
110111

111112
**Plotting**

pandas/_libs/parsers.pyx

+68-1
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@ from pandas.core.dtypes.common import (
5050
is_integer_dtype, is_float_dtype,
5151
is_bool_dtype, is_object_dtype,
5252
is_datetime64_dtype,
53+
is_timedelta64_dtype,
5354
pandas_dtype)
5455
from pandas.core.arrays import Categorical
56+
from pandas.core.dtypes.dtypes import CategoricalDtype
5557
from pandas.core.dtypes.concat import union_categoricals
5658
import pandas.io.common as com
5759

@@ -1233,7 +1235,7 @@ cdef class TextReader:
12331235
codes, cats, na_count = _categorical_convert(
12341236
self.parser, i, start, end, na_filter,
12351237
na_hashset, self.c_encoding)
1236-
cat = Categorical._from_inferred_categories(
1238+
cat = sanitize_categorical(
12371239
cats, codes, dtype, true_values=self.true_values)
12381240
return cat, na_count
12391241

@@ -2259,3 +2261,68 @@ def sanitize_objects(ndarray[object] values, set na_values,
22592261
memo[val] = val
22602262

22612263
return na_count
2264+
2265+
2266+
def sanitize_categorical(object inferred_categories,
                         object inferred_codes,
                         object dtype,
                         object true_values=None):
    """
    Construct a Categorical from categories and codes inferred by a parser.

    When `dtype` does not carry explicit categories (it is None,
    'category', or a CategoricalDtype whose ``categories`` is None), the
    inferred categories are used, sorted if they are not already
    monotonic increasing, and the codes are recoded to match.

    When `dtype` is a CategoricalDtype with explicit categories, the
    `inferred_categories` are first cast to the type of
    ``dtype.categories`` (numeric, datetime64, timedelta64 or boolean;
    conversions use ``errors='coerce'``) and the codes are then recoded
    from observation order to ``dtype.categories`` order.

    Parameters
    ----------
    inferred_categories : Index
        The distinct values that were observed.
    inferred_codes : Index
        Integer codes referring to positions in `inferred_categories`.
    dtype : CategoricalDtype, 'category' or None
    true_values : list of bytes or str, optional
        Values regarded as True when ``dtype.categories`` is boolean.
        Uses the default `true_values` defined in parsers.pyx
        by default.

    Returns
    -------
    Categorical
    """
    from pandas import Index, to_numeric, to_datetime, to_timedelta
    from pandas._libs.parsers import _true_values
    from pandas.core.arrays.categorical import _recode_for_categories

    cats = Index(inferred_categories)

    known_categories = (isinstance(dtype, CategoricalDtype) and
                        dtype.categories is not None)

    if known_categories:
        categories = dtype.categories

        # Convert to a specialized type with `dtype` if specified
        if categories.is_numeric():
            cats = to_numeric(inferred_categories, errors='coerce')
        elif is_datetime64_dtype(categories):
            cats = to_datetime(inferred_categories, errors='coerce')
        elif is_timedelta64_dtype(categories):
            cats = to_timedelta(inferred_categories, errors='coerce')
        elif categories.is_boolean():
            if true_values is None:
                true_values = _true_values
            # Only bytes entries are decoded. Some callers may hand over
            # already-decoded text (NOTE(review): the Python-engine caller
            # appears to forward user-supplied strings -- confirm), and
            # on Python 3 ``str`` has no ``decode`` method.
            true_values = [x.decode() if isinstance(x, bytes) else x
                           for x in true_values]
            cats = cats.isin(true_values)

        # recode from observation order to dtype.categories order
        codes = _recode_for_categories(inferred_codes, cats, categories)
    elif not cats.is_monotonic_increasing:
        # sort categories and recode for unknown categories;
        # sort_values returns a new Index, so `cats` still holds the
        # original (unsorted) observation order for the recode
        categories = cats.sort_values()
        codes = _recode_for_categories(inferred_codes, cats, categories)
        dtype = CategoricalDtype(categories, ordered=False)
    else:
        # already sorted: the inferred codes can be used unchanged
        dtype = CategoricalDtype(cats, ordered=False)
        codes = inferred_codes

    return Categorical(codes, dtype=dtype, fastpath=True)

pandas/core/arrays/categorical.py

-64
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
_ensure_platform_int,
2323
is_dtype_equal,
2424
is_datetimelike,
25-
is_datetime64_dtype,
26-
is_timedelta64_dtype,
2725
is_categorical,
2826
is_categorical_dtype,
2927
is_list_like, is_sequence,
@@ -551,68 +549,6 @@ def base(self):
551549
""" compat, we are always our own object """
552550
return None
553551

554-
@classmethod
555-
def _from_inferred_categories(cls, inferred_categories, inferred_codes,
556-
dtype, true_values=None):
557-
"""Construct a Categorical from inferred values
558-
559-
For inferred categories (`dtype` is None) the categories are sorted.
560-
For explicit `dtype`, the `inferred_categories` are cast to the
561-
appropriate type.
562-
563-
Parameters
564-
----------
565-
566-
inferred_categories : Index
567-
inferred_codes : Index
568-
dtype : CategoricalDtype or 'category'
569-
true_values : list of bytes, optional
570-
Uses the default `true_values` defined in parsers.pyx
571-
by default.
572-
573-
Returns
574-
-------
575-
Categorical
576-
"""
577-
from pandas import Index, to_numeric, to_datetime, to_timedelta
578-
from pandas._libs.parsers import _true_values
579-
580-
cats = Index(inferred_categories)
581-
582-
known_categories = (isinstance(dtype, CategoricalDtype) and
583-
dtype.categories is not None)
584-
585-
if known_categories:
586-
# Convert to a specialized type with `dtype` if specified
587-
if dtype.categories.is_numeric():
588-
cats = to_numeric(inferred_categories, errors='coerce')
589-
elif is_datetime64_dtype(dtype.categories):
590-
cats = to_datetime(inferred_categories, errors='coerce')
591-
elif is_timedelta64_dtype(dtype.categories):
592-
cats = to_timedelta(inferred_categories, errors='coerce')
593-
elif dtype.categories.is_boolean():
594-
if true_values is None:
595-
true_values = _true_values
596-
true_values = [x.decode() for x in true_values]
597-
cats = cats.isin(true_values)
598-
599-
if known_categories:
600-
# recode from observation order to dtype.categories order
601-
categories = dtype.categories
602-
codes = _recode_for_categories(inferred_codes, cats, categories)
603-
elif not cats.is_monotonic_increasing:
604-
# sort categories and recode for unknown categories
605-
unsorted = cats.copy()
606-
categories = cats.sort_values()
607-
codes = _recode_for_categories(inferred_codes, unsorted,
608-
categories)
609-
dtype = CategoricalDtype(categories, ordered=False)
610-
else:
611-
dtype = CategoricalDtype(cats, ordered=False)
612-
codes = inferred_codes
613-
614-
return cls(codes, dtype=dtype, fastpath=True)
615-
616552
@classmethod
617553
def from_codes(cls, codes, categories, ordered=False):
618554
"""

pandas/io/parsers.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
_ensure_index_from_sequences)
2929
from pandas.core.series import Series
3030
from pandas.core.frame import DataFrame
31-
from pandas.core.arrays import Categorical
3231
from pandas.core import algorithms
3332
import pandas.core.common as com
3433
from pandas.io.date_converters import generic_parser
@@ -1654,7 +1653,7 @@ def _cast_types(self, values, cast_type, column):
16541653
values = astype_nansafe(values, str)
16551654

16561655
cats = Index(values).unique().dropna()
1657-
values = Categorical._from_inferred_categories(
1656+
values = parsers.sanitize_categorical(
16581657
cats, cats.get_indexer(values), cast_type,
16591658
self.true_values
16601659
)

pandas/tests/categorical/test_constructors.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
Interval, IntervalIndex)
1414
from pandas.core.dtypes.dtypes import CategoricalDtype
1515
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
16+
from pandas._libs.parsers import sanitize_categorical
1617

1718

1819
class TestCategoricalConstructors(object):
@@ -468,23 +469,23 @@ def test_from_codes_with_categorical_categories(self):
468469
def test_from_inferred_categories(self, dtype):
469470
cats = ['a', 'b']
470471
codes = np.array([0, 0, 1, 1], dtype='i8')
471-
result = Categorical._from_inferred_categories(cats, codes, dtype)
472+
result = sanitize_categorical(cats, codes, dtype)
472473
expected = Categorical.from_codes(codes, cats)
473474
tm.assert_categorical_equal(result, expected)
474475

475476
@pytest.mark.parametrize('dtype', [None, 'category'])
476477
def test_from_inferred_categories_sorts(self, dtype):
477478
cats = ['b', 'a']
478479
codes = np.array([0, 1, 1, 1], dtype='i8')
479-
result = Categorical._from_inferred_categories(cats, codes, dtype)
480+
result = sanitize_categorical(cats, codes, dtype)
480481
expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
481482
tm.assert_categorical_equal(result, expected)
482483

483484
def test_from_inferred_categories_dtype(self):
484485
cats = ['a', 'b', 'd']
485486
codes = np.array([0, 1, 0, 2], dtype='i8')
486487
dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
487-
result = Categorical._from_inferred_categories(cats, codes, dtype)
488+
result = sanitize_categorical(cats, codes, dtype)
488489
expected = Categorical(['a', 'b', 'a', 'd'],
489490
categories=['c', 'b', 'a'],
490491
ordered=True)
@@ -494,7 +495,7 @@ def test_from_inferred_categories_coerces(self):
494495
cats = ['1', '2', 'bad']
495496
codes = np.array([0, 0, 1, 2], dtype='i8')
496497
dtype = CategoricalDtype([1, 2])
497-
result = Categorical._from_inferred_categories(cats, codes, dtype)
498+
result = sanitize_categorical(cats, codes, dtype)
498499
expected = Categorical([1, 1, 2, np.nan])
499500
tm.assert_categorical_equal(result, expected)
500501

0 commit comments

Comments
 (0)