Move to parsers

TomAugspurger · TomAugspurger · commit 97e5ca48a341 · 2018-07-05T12:13:50.000-05:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1316,7 +1316,6 @@ I/O
 - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
 - Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
-- Bug in :func:`read_csv` with a ``CategoricalDtype`` with boolean categories not correctly coercing the string values to booleans (:issue:`20498`)
 - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
 - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
 - Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`)
diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
@@ -106,6 +106,7 @@ Bug Fixes
 
 - Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`)
 - Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`)
+- Bug in :func:`read_csv` with a ``CategoricalDtype`` with boolean categories not correctly coercing the string values to booleans (:issue:`20498`)
 -
 
 **Plotting**
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -50,8 +50,10 @@ from pandas.core.dtypes.common import (
     is_integer_dtype, is_float_dtype,
     is_bool_dtype, is_object_dtype,
     is_datetime64_dtype,
+    is_timedelta64_dtype,
     pandas_dtype)
 from pandas.core.arrays import Categorical
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.concat import union_categoricals
 import pandas.io.common as com
 
@@ -1233,7 +1235,7 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(
                 self.parser, i, start, end, na_filter,
                 na_hashset, self.c_encoding)
-            cat = Categorical._from_inferred_categories(
+            cat = sanitize_categorical(
                 cats, codes, dtype, true_values=self.true_values)
             return cat, na_count
 
@@ -2259,3 +2261,68 @@ def sanitize_objects(ndarray[object] values, set na_values,
             memo[val] = val
 
     return na_count
+
+
+def sanitize_categorical(object inferred_categories,
+                         object inferred_codes,
+                         object dtype,
+                         object true_values=None):
+    """Construct a Categorical from inferred values
+
+    For inferred categories (`dtype` is None) the categories are sorted.
+    For explicit `dtype`, the `inferred_categories` are cast to the
+    appropriate type.
+
+    Parameters
+    ----------
+
+    inferred_categories : Index
+    inferred_codes : Index
+    dtype : CategoricalDtype or 'category'
+    true_values : list of bytes, optional
+        Uses the default `true_values` defined in parsers.pyx
+        by default.
+
+    Returns
+    -------
+    Categorical
+    """
+    from pandas import Index, to_numeric, to_datetime, to_timedelta
+    from pandas._libs.parsers import _true_values
+    from pandas.core.arrays.categorical import _recode_for_categories
+
+    cats = Index(inferred_categories)
+
+    known_categories = (isinstance(dtype, CategoricalDtype) and
+                        dtype.categories is not None)
+
+    if known_categories:
+        # Convert to a specialized type with `dtype` if specified
+        if dtype.categories.is_numeric():
+            cats = to_numeric(inferred_categories, errors='coerce')
+        elif is_datetime64_dtype(dtype.categories):
+            cats = to_datetime(inferred_categories, errors='coerce')
+        elif is_timedelta64_dtype(dtype.categories):
+            cats = to_timedelta(inferred_categories, errors='coerce')
+        elif dtype.categories.is_boolean():
+            if true_values is None:
+                true_values = _true_values
+            true_values = [x.decode() for x in true_values]
+            cats = cats.isin(true_values)
+
+    if known_categories:
+        # recode from observation order to dtype.categories order
+        categories = dtype.categories
+        codes = _recode_for_categories(inferred_codes, cats, categories)
+    elif not cats.is_monotonic_increasing:
+        # sort categories and recode for unknown categories
+        unsorted = cats.copy()
+        categories = cats.sort_values()
+        codes = _recode_for_categories(inferred_codes, unsorted,
+                                        categories)
+        dtype = CategoricalDtype(categories, ordered=False)
+    else:
+        dtype = CategoricalDtype(cats, ordered=False)
+        codes = inferred_codes
+
+    return Categorical(codes, dtype=dtype, fastpath=True)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -22,8 +22,6 @@
     _ensure_platform_int,
     is_dtype_equal,
     is_datetimelike,
-    is_datetime64_dtype,
-    is_timedelta64_dtype,
     is_categorical,
     is_categorical_dtype,
     is_list_like, is_sequence,
@@ -551,68 +549,6 @@ def base(self):
         """ compat, we are always our own object """
         return None
 
-    @classmethod
-    def _from_inferred_categories(cls, inferred_categories, inferred_codes,
-                                  dtype, true_values=None):
-        """Construct a Categorical from inferred values
-
-        For inferred categories (`dtype` is None) the categories are sorted.
-        For explicit `dtype`, the `inferred_categories` are cast to the
-        appropriate type.
-
-        Parameters
-        ----------
-
-        inferred_categories : Index
-        inferred_codes : Index
-        dtype : CategoricalDtype or 'category'
-        true_values : list of bytes, optional
-            Uses the default `true_values` defined in parsers.pyx
-            by default.
-
-        Returns
-        -------
-        Categorical
-        """
-        from pandas import Index, to_numeric, to_datetime, to_timedelta
-        from pandas._libs.parsers import _true_values
-
-        cats = Index(inferred_categories)
-
-        known_categories = (isinstance(dtype, CategoricalDtype) and
-                            dtype.categories is not None)
-
-        if known_categories:
-            # Convert to a specialzed type with `dtype` if specified
-            if dtype.categories.is_numeric():
-                cats = to_numeric(inferred_categories, errors='coerce')
-            elif is_datetime64_dtype(dtype.categories):
-                cats = to_datetime(inferred_categories, errors='coerce')
-            elif is_timedelta64_dtype(dtype.categories):
-                cats = to_timedelta(inferred_categories, errors='coerce')
-            elif dtype.categories.is_boolean():
-                if true_values is None:
-                    true_values = _true_values
-                true_values = [x.decode() for x in true_values]
-                cats = cats.isin(true_values)
-
-        if known_categories:
-            # recode from observation order to dtype.categories order
-            categories = dtype.categories
-            codes = _recode_for_categories(inferred_codes, cats, categories)
-        elif not cats.is_monotonic_increasing:
-            # sort categories and recode for unknown categories
-            unsorted = cats.copy()
-            categories = cats.sort_values()
-            codes = _recode_for_categories(inferred_codes, unsorted,
-                                           categories)
-            dtype = CategoricalDtype(categories, ordered=False)
-        else:
-            dtype = CategoricalDtype(cats, ordered=False)
-            codes = inferred_codes
-
-        return cls(codes, dtype=dtype, fastpath=True)
-
     @classmethod
     def from_codes(cls, codes, categories, ordered=False):
         """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -28,7 +28,6 @@
                                _ensure_index_from_sequences)
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
-from pandas.core.arrays import Categorical
 from pandas.core import algorithms
 import pandas.core.common as com
 from pandas.io.date_converters import generic_parser
@@ -1654,7 +1653,7 @@ def _cast_types(self, values, cast_type, column):
                 values = astype_nansafe(values, str)
 
             cats = Index(values).unique().dropna()
-            values = Categorical._from_inferred_categories(
+            values = parsers.sanitize_categorical(
                 cats, cats.get_indexer(values), cast_type,
                 self.true_values
             )
diff --git a/pandas/tests/categorical/test_constructors.py b/pandas/tests/categorical/test_constructors.py
@@ -13,6 +13,7 @@
                     Interval, IntervalIndex)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
+from pandas._libs.parsers import sanitize_categorical
 
 
 class TestCategoricalConstructors(object):
@@ -468,23 +469,23 @@ def test_from_codes_with_categorical_categories(self):
     def test_from_inferred_categories(self, dtype):
         cats = ['a', 'b']
         codes = np.array([0, 0, 1, 1], dtype='i8')
-        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        result = sanitize_categorical(cats, codes, dtype)
         expected = Categorical.from_codes(codes, cats)
         tm.assert_categorical_equal(result, expected)
 
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories_sorts(self, dtype):
         cats = ['b', 'a']
         codes = np.array([0, 1, 1, 1], dtype='i8')
-        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        result = sanitize_categorical(cats, codes, dtype)
         expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b'])
         tm.assert_categorical_equal(result, expected)
 
     def test_from_inferred_categories_dtype(self):
         cats = ['a', 'b', 'd']
         codes = np.array([0, 1, 0, 2], dtype='i8')
         dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True)
-        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        result = sanitize_categorical(cats, codes, dtype)
         expected = Categorical(['a', 'b', 'a', 'd'],
                                categories=['c', 'b', 'a'],
                                ordered=True)
@@ -494,7 +495,7 @@ def test_from_inferred_categories_coerces(self):
         cats = ['1', '2', 'bad']
         codes = np.array([0, 0, 1, 2], dtype='i8')
         dtype = CategoricalDtype([1, 2])
-        result = Categorical._from_inferred_categories(cats, codes, dtype)
+        result = sanitize_categorical(cats, codes, dtype)
         expected = Categorical([1, 1, 2, np.nan])
         tm.assert_categorical_equal(result, expected)
 

Original file line number	Diff line number	Diff line change
`@@ -106,6 +106,7 @@ Bug Fixes`
`106`	`106`
`107`	`107`	- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`)
`108`	`108`	- Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`)
	`109`	+- Bug in :func:`read_csv` with a ``CategoricalDtype`` with boolean categories not correctly coercing the string values to booleans (:issue:`20498`)
`109`	`110`	`-`
`110`	`111`
`111`	`112`	`Plotting`