From 286d90764bcaceb2a2cde3c637323611439b94b0 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 8 Jun 2016 18:26:28 -0500 Subject: [PATCH 01/11] ENH: parse categoricals in read_csv --- pandas/io/tests/parser/c_parser_only.py | 23 ++- pandas/parser.pyx | 258 ++++++++++++++++++++---- 2 files changed, 242 insertions(+), 39 deletions(-) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 103c9fa2b7ce8..9f264cadfa473 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,9 +12,10 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex +from pandas import DataFrame, Series, Index, MultiIndex, Categorical from pandas import compat from pandas.compat import StringIO, range, lrange +from pandas.types.dtypes import CategoricalDtype class CParserTests(object): @@ -184,6 +185,26 @@ def test_pass_dtype(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'object') + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical([1, 1, 2]), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical([3.4, 3.4, 4.5])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( diff --git a/pandas/parser.pyx b/pandas/parser.pyx index e72e2f90a5213..72801c631070b 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -25,6 +25,7 @@ cdef extern from "Python.h": cdef extern from "stdlib.h": void memcpy(void *dst, void *src, size_t n) +cimport cython cimport numpy as cnp from numpy cimport ndarray, uint8_t, uint64_t @@ -33,6 +34,9 @@ import numpy as np cimport util import pandas.lib as lib +from pandas.core.common import is_categorical_dtype, CategoricalDtype +from pandas.core.categorical import Categorical +from pandas.types.concat import union_categoricals import time import os @@ -220,6 +224,24 @@ cdef extern from "parser/tokenizer.h": int to_boolean(const char *item, uint8_t *val) nogil +# XXX +# this is a hack - in order to make the inference +# functions generic (converting either data directly +# from the parser or from a passed in hash table) +# we add an "optional" parameter via fused type, that can either +# be the hash table to parse, or an integer, which is used +# as a sentinel to specialize the function for reading +# from the parser. 
+ +# This is to avoid duplicating a bunch of code or +# adding runtime checks, but may be too much +ctypedef kh_str_t* kh_str_t_p +ctypedef int use_parser_data + +ctypedef fused inference_data_t: + kh_str_t_p + use_parser_data + cdef extern from "parser/io.h": void *new_mmap(char *fname) int del_mmap(void *src) @@ -475,12 +497,17 @@ cdef class TextReader: conv = {} for k in dtype: v = dtype[k] - if isinstance(v, basestring): + if is_categorical_dtype(v): + v = CategoricalDtype() + elif isinstance(v, basestring): v = np.dtype(v) conv[k] = v dtype = conv elif dtype is not None: - dtype = np.dtype(dtype) + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + else: + dtype = np.dtype(dtype) self.dtype = dtype @@ -1082,7 +1109,7 @@ cdef class TextReader: if col_dtype is not None: if not isinstance(col_dtype, basestring): - if isinstance(col_dtype, np.dtype): + if isinstance(col_dtype, np.dtype) or is_categorical_dtype(col_dtype): col_dtype = col_dtype.str else: col_dtype = np.dtype(col_dtype).str @@ -1138,11 +1165,12 @@ cdef class TextReader: object na_flist): if dtype[1] == 'i' or dtype[1] == 'u': result, na_count = _try_int64(self.parser, i, start, end, - na_filter, na_hashset) + na_filter, na_hashset, + NULL) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + "column {column}".format(column=i)) if result is not None and dtype[1:] != 'i8': result = result.astype(dtype) @@ -1151,7 +1179,8 @@ cdef class TextReader: elif dtype[1] == 'f': result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset, na_flist) + na_filter, na_hashset, na_flist, + NULL) if result is not None and dtype[1:] != 'f8': result = result.astype(dtype) @@ -1160,7 +1189,8 @@ cdef class TextReader: elif dtype[1] == 'b': result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, - self.true_set, self.false_set) + self.true_set, self.false_set, + NULL) return result, na_count elif dtype[1] == 'c': raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) @@ -1183,8 +1213,15 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - - + # is this comparison good enough? 
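
The branch below answers the "good enough?" question with a stop-gap: by this point dtypes reach _convert_with_dtype as dtype strings (note the col_dtype.str conversion above), and '|O08' is the string form that CategoricalDtype's .str yields here, so the equality test picks out categorical requests. Later commits in this series replace it with is_categorical_dtype on real dtype objects. A minimal sketch of that normalization, assuming the 0.19-era import locations adopted later in the series:

    from pandas.types.common import is_categorical_dtype, pandas_dtype

    for spec in ('category', 'int64', 'float64'):
        dt = pandas_dtype(spec)            # 'category' -> CategoricalDtype()
        print(repr(dt), is_categorical_dtype(dt))
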
+ elif dtype == '|O08': + codes, cats, na_count = _categorical_convert(self.parser, i, start, + end, na_filter, na_hashset, + na_flist, self.true_set, + self.false_set, self.c_encoding) + + return Categorical(codes, categories=cats, ordered=False, + fastpath=True), na_count elif dtype[1] == 'O': return self._string_convert(i, start, end, na_filter, na_hashset) @@ -1500,6 +1537,97 @@ cdef _string_box_decode(parser_t *parser, int col, return result, na_count +@cython.boundscheck(False) +cdef _categorical_convert(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + object na_flist, const kh_str_t *true_hashset, + const kh_str_t *false_hashset, + char *encoding): + "Convert column data into codes, categories" + cdef: + int error, na_count = 0 + Py_ssize_t i, size + size_t lines + coliter_t it + const char *word = NULL + int64_t NA = -1 + int64_t[:] codes + size_t current_category = 0 + + char *errors = "strict" + + int ret = 0 + kh_str_t *table + + khiter_t k + + lines = line_end - line_start + codes = np.empty(lines, dtype=np.int64) + with nogil: + table = kh_init_str() + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + COLITER_NEXT(it, word) + + if na_filter: + k = kh_get_str(na_hashset, word) + # is in NA values + if k != na_hashset.n_buckets: + na_count += 1 + codes[i] = NA + continue + + k = kh_get_str(table, word) + # not in the hash table + if k == table.n_buckets: + k = kh_put_str(table, word, &ret) + table.vals[k] = current_category + current_category += 1 + + codes[i] = table.vals[k] + + + # follow the same inference attempts as + # normal data (int64, float64, bool, object) + result, result_na = _try_int64(parser, col, 0, table.n_occupied, + na_filter, na_hashset, table) + if result is None: + result, result_na = _try_double(parser, col, 0, table.n_occupied, + na_filter, na_hashset, na_flist, + table) + if result is None: + # bool categorical doesn't really make sense, but following the + # inference path for now + result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied, + na_filter, na_hashset, true_hashset, + false_hashset, table) + # duplicated logic here, but doesn't make sense to reuse + # other string logic since those paths factorize where we + # already have guaranteed uniques + if result is None: + i = 0 + result = np.empty(table.n_occupied, dtype=np.object_) + if encoding != NULL and encoding != b"utf-8": + for k in range(table.n_buckets): + if kh_exist_str(table, k): + size = strlen(table.keys[k]) + result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + i += 1 + elif PY3 or encoding != NULL: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyUnicode_FromString(table.keys[k]) + i += 1 + else: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyBytes_FromString(table.keys[k]) + i += 1 + + kh_destroy_str(table) + return np.asarray(codes), result, na_count cdef _to_fw_string(parser_t *parser, int col, int line_start, int line_end, size_t width): @@ -1536,12 +1664,12 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, object na_flist): +cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, object na_flist, + inference_data_t inference_data): cdef: int error, na_count = 0 size_t i, lines - coliter_t it const char 
*word = NULL char *p_end double *data @@ -1557,17 +1685,19 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + na_filter, na_hashset, use_na_flist, na_fset, NA, data, + &na_count, inference_data) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count + cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, - const kh_float64_t *na_flist, - double NA, - double *data, int *na_count) nogil: + bint na_filter, kh_str_t *na_hashset, bint use_na_flist, + const kh_float64_t *na_flist, + double NA, double *data, int *na_count, + inference_data_t inference_data) nogil: cdef: int error, size_t i @@ -1576,15 +1706,24 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int const char *word = NULL char *p_end khiter_t k, k64 + # only used with passed in data + khiter_t kit = 0 global errno na_count[0] = 0 - coliter_setup(&it, parser, col, line_start) + + # these type checks specialize at compile time + # see typedefs + if inference_data_t is use_parser_data: + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1610,7 +1749,11 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data += 1 else: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) + data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1626,11 +1769,11 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int return 0 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_t *na_hashset, + inference_data_t inference_data): cdef: int error, na_count = 0 size_t i, lines - coliter_t it int64_t *data ndarray result @@ -1640,9 +1783,10 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data - coliter_setup(&it, parser, col, line_start) + # compile time with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, + na_hashset, NA, data, &na_count, inference_data) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1653,21 +1797,26 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count) nogil: + int *na_count, inference_data_t inference_data) nogil: cdef: int error size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k + khiter_t k, kit = 0 na_count[0] = 0 - coliter_setup(&it, parser, col, line_start) + # compile time checks 
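
These `inference_data_t is use_parser_data` tests are resolved while Cython generates each specialization, so the emitted C keeps only one branch and pays no runtime cost. A standalone sketch of the idiom (illustrative only, not the pandas code):

    ctypedef fused source_t:
        int
        double

    cdef double scaled(source_t x):
        # resolved at compile time: each generated specialization
        # keeps exactly one of these branches
        if source_t is int:
            return x * 2.0
        else:
            return x / 2.0

    # a caller can also pick the specialization explicitly, the syntax a
    # later commit uses as _try_int64[use_parser_data](...):
    #     scaled[int](3), scaled[double](3.0)
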
+ if inference_data_t is use_parser_data: + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1681,7 +1830,10 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int return error else: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1719,6 +1871,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l const char *word = NULL khiter_t k na_count[0] = 0 + coliter_setup(&it, parser, col, line_start) if na_filter: @@ -1749,7 +1902,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset): + const kh_str_t *true_hashset, const kh_str_t *false_hashset, + inference_data_t inference_data): cdef: int error, na_count = 0 size_t i, lines @@ -1765,8 +1919,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, - true_hashset, false_hashset, NA, data, &na_count) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, + na_hashset, true_hashset, false_hashset, NA, data, + &na_count, inference_data) if error != 0: return None, None return result.view(np.bool_), na_count @@ -1774,21 +1929,27 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count) nogil: + uint8_t NA, uint8_t *data, int *na_count, + inference_data_t inference_data) nogil: cdef: int error = 0 size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k + khiter_t k, kit = 0 na_count[0] = 0 - coliter_setup(&it, parser, col, line_start) + # compile time + if inference_data_t is use_parser_data: + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1815,7 +1976,10 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, data += 1 else: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1836,6 +2000,19 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, return 0 + +cdef inline khiter_t _htable_next(kh_str_t *table, khiter_t k, char **word) nogil: + """given starting iterator, asssign next valid key to word and return + the next 
iterator""" + while k < table.n_buckets: + if kh_exist_str(table, k): + break + k += 1 + + word[0] = table.keys[k] + return (k + 1) + + cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1924,7 +2101,12 @@ def _concatenate_chunks(list chunks): common_type = np.find_common_type(dtypes, []) if common_type == np.object: warning_columns.append(str(name)) - result[name] = np.concatenate(arrs) + + if is_categorical_dtype(dtypes.pop()): + result[name] = union_categoricals(arrs) + #np.concatenate([c.codes for c in arrs]) + else: + result[name] = np.concatenate(arrs) if warning_columns: warning_names = ','.join(warning_columns) From cfa0ce402e5b18fb992b7a56325194faa17e9742 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 11 Jun 2016 06:30:46 -0500 Subject: [PATCH 02/11] clean up dtype checking, add function specialization --- pandas/parser.pyx | 105 ++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 59 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 72801c631070b..f67c7ed6ad85f 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -34,7 +34,10 @@ import numpy as np cimport util import pandas.lib as lib -from pandas.core.common import is_categorical_dtype, CategoricalDtype +from pandas.core.common import (is_categorical_dtype, CategoricalDtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_string_dtype, is_datetime64_dtype) from pandas.core.categorical import Categorical from pandas.types.concat import union_categoricals @@ -224,19 +227,13 @@ cdef extern from "parser/tokenizer.h": int to_boolean(const char *item, uint8_t *val) nogil -# XXX -# this is a hack - in order to make the inference -# functions generic (converting either data directly -# from the parser or from a passed in hash table) -# we add an "optional" parameter via fused type, that can either -# be the hash table to parse, or an integer, which is used -# as a sentinel to specialize the function for reading -# from the parser. -# This is to avoid duplicating a bunch of code or -# adding runtime checks, but may be too much +# to make the inference functions generic +# add an optional last parameter that is +# the source of data to be used +# other than the parser_t ctypedef kh_str_t* kh_str_t_p -ctypedef int use_parser_data +ctypedef void* use_parser_data ctypedef fused inference_data_t: kh_str_t_p @@ -421,11 +418,12 @@ cdef class TextReader: self._set_quoting(quotechar, quoting) - # TODO: endianness just a placeholder? 
+ + dtype_order = ['int64', 'float64', 'bool', 'object'] if quoting == QUOTE_NONNUMERIC: - self.dtype_cast_order = [' 1: @@ -1108,12 +1106,6 @@ cdef class TextReader: col_dtype = self.dtype if col_dtype is not None: - if not isinstance(col_dtype, basestring): - if isinstance(col_dtype, np.dtype) or is_categorical_dtype(col_dtype): - col_dtype = col_dtype.str - else: - col_dtype = np.dtype(col_dtype).str - col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, na_filter, 1, na_hashset, na_flist) @@ -1131,7 +1123,7 @@ cdef class TextReader: dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - '|O8', i, start, end, na_filter, 0, na_hashset, na_flist) + np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist) if col_res is not None: break @@ -1163,41 +1155,38 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if dtype[1] == 'i' or dtype[1] == 'u': - result, na_count = _try_int64(self.parser, i, start, end, - na_filter, na_hashset, - NULL) + if is_integer_dtype(dtype): + result, na_count = _try_int64[use_parser_data](self.parser, i, + start, end, na_filter, + na_hashset, NULL) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " "column {column}".format(column=i)) - if result is not None and dtype[1:] != 'i8': + if result is not None and dtype != 'int64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'f': - result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset, na_flist, - NULL) + elif is_float_dtype(dtype): + result, na_count = _try_double[use_parser_data](self.parser, i, start, end, + na_filter, na_hashset, na_flist, + NULL) - if result is not None and dtype[1:] != 'f8': + if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'b': - result, na_count = _try_bool_flex(self.parser, i, start, end, - na_filter, na_hashset, - self.true_set, self.false_set, - NULL) + elif is_bool_dtype(dtype): + result, na_count = _try_bool_flex[use_parser_data](self.parser, i, start, end, + na_filter, na_hashset, + self.true_set, self.false_set, + NULL) return result, na_count - elif dtype[1] == 'c': - raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) - - elif dtype[1] == 'S': + elif dtype.kind == 'S': # TODO: na handling - width = int(dtype[2:]) + width = dtype.itemsize if width > 0: result = _to_fw_string(self.parser, i, start, end, width) return result, 0 @@ -1205,8 +1194,8 @@ cdef class TextReader: # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, na_hashset) - elif dtype[1] == 'U': - width = int(dtype[2:]) + elif dtype.kind == 'U': + width = dtype.itemsize if width > 0: raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) @@ -1214,19 +1203,18 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) # is this comparison good enough? 
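
The deletions that follow are the payoff of the new predicates: positional checks on the dtype string (dtype[1], int(dtype[2:])) give way to is_*_dtype helpers and np.dtype attributes. A hedged illustration, using the pandas.core.common paths this commit imports from:

    from pandas.core.common import is_integer_dtype, is_categorical_dtype
    import numpy as np

    dt = np.dtype('S8')
    print(dt.kind, dt.itemsize)                 # S 8, replacing int(dtype_str[2:])
    print(is_integer_dtype(np.dtype('uint8')))  # True
    print(is_categorical_dtype('category'))     # True
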
- elif dtype == '|O08': + elif is_categorical_dtype(dtype): codes, cats, na_count = _categorical_convert(self.parser, i, start, end, na_filter, na_hashset, na_flist, self.true_set, self.false_set, self.c_encoding) - return Categorical(codes, categories=cats, ordered=False, fastpath=True), na_count - elif dtype[1] == 'O': + elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) else: - if dtype[1] == 'M': + if is_datetime64_dtype(dtype): raise TypeError("the dtype %s is not supported for parsing, " "pass this column using parse_dates instead" % dtype) raise TypeError("the dtype %s is not supported for parsing" % dtype) @@ -1588,7 +1576,7 @@ cdef _categorical_convert(parser_t *parser, int col, codes[i] = table.vals[k] - + # Codes are complete, now inference on cats # follow the same inference attempts as # normal data (int64, float64, bool, object) result, result_na = _try_int64(parser, col, 0, table.n_occupied, @@ -1603,9 +1591,10 @@ cdef _categorical_convert(parser_t *parser, int col, result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied, na_filter, na_hashset, true_hashset, false_hashset, table) - # duplicated logic here, but doesn't make sense to reuse - # other string logic since those paths factorize where we - # already have guaranteed uniques + + # if no numeric types parsed, convert to object. + # Note that the decoding path logic should sync up with that + # of `TextReader.string_convert` if result is None: i = 0 result = np.empty(table.n_occupied, dtype=np.object_) @@ -1694,10 +1683,10 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, - const kh_float64_t *na_flist, - double NA, double *data, int *na_count, - inference_data_t inference_data) nogil: + bint na_filter, kh_str_t *na_hashset, bint use_na_flist, + const kh_float64_t *na_flist, + double NA, double *data, int *na_count, + inference_data_t inference_data) nogil: cdef: int error, size_t i @@ -1783,7 +1772,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data - # compile time with nogil: error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count, inference_data) @@ -2104,7 +2092,6 @@ def _concatenate_chunks(list chunks): if is_categorical_dtype(dtypes.pop()): result[name] = union_categoricals(arrs) - #np.concatenate([c.codes for c in arrs]) else: result[name] = np.concatenate(arrs) From 849a112a37ee3cb0f9e3cd164ae17fd589343c98 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 12 Jun 2016 08:33:07 -0500 Subject: [PATCH 03/11] fix some dtype checking --- pandas/parser.pyx | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index f67c7ed6ad85f..84d2a52740845 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -38,6 +38,7 @@ from pandas.core.common import (is_categorical_dtype, CategoricalDtype, is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, is_string_dtype, is_datetime64_dtype) +from pandas.types.api import pandas_dtype from pandas.core.categorical import Categorical from pandas.types.concat import union_categoricals @@ -492,22 +493,13 @@ cdef class TextReader: self.encoding = encoding if isinstance(dtype, dict): - conv = {} - for k in dtype: 
- v = dtype[k] - if is_categorical_dtype(v): - v = CategoricalDtype() - elif isinstance(v, basestring): - v = np.dtype(v) - conv[k] = v - dtype = conv + dtype = {k: pandas_dtype(dtype[k]) + for k in dtype} elif dtype is not None: - if is_categorical_dtype(dtype): - dtype = CategoricalDtype() - else: - dtype = np.dtype(dtype) + dtype = pandas_dtype(dtype) self.dtype = dtype + print dtype # XXX self.noconvert = set() @@ -1101,7 +1093,8 @@ cdef class TextReader: col_dtype = self.dtype[i] else: if self.dtype.names: - col_dtype = self.dtype.descr[i][1] + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) else: col_dtype = self.dtype @@ -1202,7 +1195,6 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - # is this comparison good enough? elif is_categorical_dtype(dtype): codes, cats, na_count = _categorical_convert(self.parser, i, start, end, na_filter, na_hashset, From 4e0722d063aad3622eb23d17408eb0c03b81995f Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 16 Jul 2016 12:06:40 -0500 Subject: [PATCH 04/11] undo type inference add docs and asv --- asv_bench/benchmarks/parser_vb.py | 21 ++ doc/source/io.rst | 34 +++ doc/source/whatsnew/v0.19.0.txt | 46 ++++ pandas/io/tests/parser/c_parser_only.py | 24 ++- pandas/parser.pyx | 274 +++++++++--------------- 5 files changed, 220 insertions(+), 179 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 04f25034638cd..614bfa5ec5d6e 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -114,6 +114,27 @@ def teardown(self): os.remove('test.csv') +class read_csv_categorical(object): + def setup(self): + goal_time = 0.2 + + N = 100000 + group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] + df = DataFrame({'a': np.random.choice(group1, N).astype('object'), + 'b': np.random.choice(group1, N).astype('object'), + 'c': np.random.choice(group1, N).astype('object')}) + df.to_csv('strings.csv', index=False) + + def time_read_csv_categorical_post(self): + read_csv('strings.csv').apply(pd.Categorical) + + def time_read_csv_categorical_direct(self): + read_csv('strings.csv', dtype='category') + + def teardown(self): + os.remove('strings.csv') + + class read_table_multiple_date(object): goal_time = 0.2 diff --git a/doc/source/io.rst b/doc/source/io.rst index 2866371cce61a..4ff6a79a3ef64 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -500,6 +500,40 @@ worth trying. data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. +Specifying Categorical dtype +'''''''''''''''''''''''''''' + +.. versionadded:: 0.19.0 + +`Categorical` columns can be parsed directly by specifying `dtype='category'` + +.. ipython :: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a `Categorical` using a dict specification + +.. ipython :: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as string (object dtype). + Numeric categories can be converted using the :func:`pd.to_numeric` function. + + .. 
ipython :: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] +>>>>>>> undo type inference add docs and asv Naming and Using Columns diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 59a106291dad8..9d51da1233dd8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -195,6 +195,14 @@ default of the index) in a DataFrame. :func:`read_csv` has improved support for duplicate column names ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: + + :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) @@ -222,6 +230,44 @@ New behaviour: In [2]: pd.read_csv(StringIO(data), names=names) + +.. _whatsnew_0190.enhancements.read_csv_categorical: + +:func:`read_csv` supports parsing `Categorical` directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`read_csv` function now supports parsing a `Categorical` column when +specified as a dtype (:issue:`10153`). Depending on the structure of the data, +this can result in a faster parse time and lower memory usage, compared to +converting to `Categorical` after parsing. + +.. ipython :: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a `Categorical` using a dict specification + +.. ipython :: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as string (object dtype). + Numeric categories can be converted using the :func:`pd.to_numeric` function. + + .. ipython :: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] + .. 
_whatsnew_0190.enhancements.semi_month_offsets: Semi-Month Offsets diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 9f264cadfa473..0d706a2fe2c4b 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -191,9 +191,9 @@ def test_categorical_dtype(self): 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = pd.DataFrame({'a': Categorical([1, 1, 2]), + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical([3.4, 3.4, 4.5])}) + 'c': Categorical(['3.4', '3.4', '4.5'])}) actual = self.read_csv(StringIO(data), dtype='category') tm.assert_frame_equal(actual, expected) @@ -205,6 +205,26 @@ def test_categorical_dtype(self): 'c': CategoricalDtype()}) tm.assert_frame_equal(actual, expected) + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + cases = [ + ('unicode_series.csv', 'latin-1'), + ('utf16_ex.txt', 'utf-16') + ] + + for f, encoding in cases: + pth = tm.get_data_path(f) + expected = self.read_csv(pth, header=None, encoding=encoding) + result = self.read_csv(pth, header=None, encoding=encoding, dtype='category') + result = result.apply(lambda x: x.astype(object)) + tm.assert_frame_equal(actual, expected) + def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 84d2a52740845..e680cab5ff90c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -34,11 +34,11 @@ import numpy as np cimport util import pandas.lib as lib -from pandas.core.common import (is_categorical_dtype, CategoricalDtype, - is_integer_dtype, is_float_dtype, - is_bool_dtype, is_object_dtype, - is_string_dtype, is_datetime64_dtype) -from pandas.types.api import pandas_dtype +from pandas.types.common import (is_categorical_dtype, CategoricalDtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_string_dtype, is_datetime64_dtype, + pandas_dtype) from pandas.core.categorical import Categorical from pandas.types.concat import union_categoricals @@ -228,18 +228,6 @@ cdef extern from "parser/tokenizer.h": int to_boolean(const char *item, uint8_t *val) nogil - -# to make the inference functions generic -# add an optional last parameter that is -# the source of data to be used -# other than the parser_t -ctypedef kh_str_t* kh_str_t_p -ctypedef void* use_parser_data - -ctypedef fused inference_data_t: - kh_str_t_p - use_parser_data - cdef extern from "parser/io.h": void *new_mmap(char *fname) int del_mmap(void *src) @@ -499,7 +487,6 @@ cdef class TextReader: dtype = pandas_dtype(dtype) self.dtype = dtype - print dtype # XXX self.noconvert = set() @@ -706,6 +693,7 @@ cdef class TextReader: int status Py_ssize_t size char *errors = "strict" + cdef StringPath path = _string_path(self.c_encoding) header = [] @@ -735,20 +723,18 @@ cdef class TextReader: field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] - # TODO: Py3 vs. 
Py2 counts = {} unnamed_count = 0 for i in range(field_count): word = self.parser.words[start + i] - if self.c_encoding == NULL and not PY3: + if path == CSTRING: name = PyBytes_FromString(word) - else: - if self.c_encoding == NULL or self.c_encoding == b'utf-8': - name = PyUnicode_FromString(word) - else: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + elif path == UTF8: + name = PyUnicode_FromString(word) + elif path == ENCODED: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) if name == '': if self.has_mi_columns: @@ -1149,9 +1135,8 @@ cdef class TextReader: kh_str_t *na_hashset, object na_flist): if is_integer_dtype(dtype): - result, na_count = _try_int64[use_parser_data](self.parser, i, - start, end, na_filter, - na_hashset, NULL) + result, na_count = _try_int64(self.parser, i, start, end, na_filter, + na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " @@ -1163,19 +1148,17 @@ cdef class TextReader: return result, na_count elif is_float_dtype(dtype): - result, na_count = _try_double[use_parser_data](self.parser, i, start, end, - na_filter, na_hashset, na_flist, - NULL) + result, na_count = _try_double(self.parser, i, start, end, + na_filter, na_hashset, na_flist) if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count elif is_bool_dtype(dtype): - result, na_count = _try_bool_flex[use_parser_data](self.parser, i, start, end, - na_filter, na_hashset, - self.true_set, self.false_set, - NULL) + result, na_count = _try_bool_flex(self.parser, i, start, end, + na_filter, na_hashset, + self.true_set, self.false_set) return result, na_count elif dtype.kind == 'S': # TODO: na handling @@ -1186,7 +1169,7 @@ cdef class TextReader: # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, - na_hashset) + na_hashset) elif dtype.kind == 'U': width = dtype.itemsize if width > 0: @@ -1198,8 +1181,7 @@ cdef class TextReader: elif is_categorical_dtype(dtype): codes, cats, na_count = _categorical_convert(self.parser, i, start, end, na_filter, na_hashset, - na_flist, self.true_set, - self.false_set, self.c_encoding) + self.c_encoding) return Categorical(codes, categories=cats, ordered=False, fastpath=True), na_count elif is_object_dtype(dtype): @@ -1213,30 +1195,19 @@ cdef class TextReader: cdef _string_convert(self, Py_ssize_t i, int start, int end, bint na_filter, kh_str_t *na_hashset): - if PY3: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_factorize(self.parser, i, start, end, - na_filter, na_hashset) + + cdef StringPath path = _string_path(self.c_encoding) + + if path == UTF8: + return _string_box_utf8(self.parser, i, start, end, na_filter, + na_hashset) + elif path == ENCODED: + return _string_box_decode(self.parser, i, start, end, + na_filter, na_hashset, self.c_encoding) + elif path == CSTRING: + return 
_string_box_factorize(self.parser, i, start, end, + na_filter, na_hashset) + def _get_converter(self, i, name): if self.converters is None: @@ -1348,6 +1319,19 @@ def _maybe_upcast(arr): return arr +cdef enum StringPath: + CSTRING + UTF8 + ENCODED + +# factored out logic to pick string converter +cdef inline StringPath _string_path(char *encoding): + if encoding != NULL and encoding != b"utf-8": + return ENCODED + elif PY3 or encoding != NULL: + return UTF8 + else: + return CSTRING # ---------------------------------------------------------------------- # Type conversions / inference support code @@ -1521,8 +1505,6 @@ cdef _string_box_decode(parser_t *parser, int col, cdef _categorical_convert(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, - object na_flist, const kh_str_t *true_hashset, - const kh_str_t *false_hashset, char *encoding): "Convert column data into codes, categories" cdef: @@ -1531,19 +1513,22 @@ cdef _categorical_convert(parser_t *parser, int col, size_t lines coliter_t it const char *word = NULL + int64_t NA = -1 int64_t[:] codes - size_t current_category = 0 + int64_t current_category = 0 char *errors = "strict" + cdef StringPath path = _string_path(encoding) int ret = 0 kh_str_t *table - khiter_t k lines = line_end - line_start codes = np.empty(lines, dtype=np.int64) + # factorize parsed values, creating a hash table + # bytes -> category with nogil: table = kh_init_str() coliter_setup(&it, parser, col, line_start) @@ -1568,44 +1553,25 @@ cdef _categorical_convert(parser_t *parser, int col, codes[i] = table.vals[k] - # Codes are complete, now inference on cats - # follow the same inference attempts as - # normal data (int64, float64, bool, object) - result, result_na = _try_int64(parser, col, 0, table.n_occupied, - na_filter, na_hashset, table) - if result is None: - result, result_na = _try_double(parser, col, 0, table.n_occupied, - na_filter, na_hashset, na_flist, - table) - if result is None: - # bool categorical doesn't really make sense, but following the - # inference path for now - result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied, - na_filter, na_hashset, true_hashset, - false_hashset, table) - - # if no numeric types parsed, convert to object. 
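
The nogil loop above is the heart of _categorical_convert. In pure Python, with a dict standing in for the khash table, the pass looks roughly like this (a sketch of the logic, not the implementation):

    import numpy as np

    def factorize_words(words, na_values=frozenset()):
        table = {}
        codes = np.empty(len(words), dtype=np.int64)
        na_count = 0
        for i, word in enumerate(words):
            if word in na_values:
                codes[i] = -1          # NA sentinel, as in the cython loop
                na_count += 1
                continue
            codes[i] = table.setdefault(word, len(table))
        # categories indexed by their code, i.e. first-appearance order
        cats = np.array(sorted(table, key=table.get), dtype=object)
        return codes, cats, na_count

    # factorize_words(['b', 'b', 'nan', 'a'], {'nan'}) ->
    # (array([ 0,  0, -1,  1]), array(['b', 'a'], dtype=object), 1)
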
- # Note that the decoding path logic should sync up with that - # of `TextReader.string_convert` - if result is None: - i = 0 - result = np.empty(table.n_occupied, dtype=np.object_) - if encoding != NULL and encoding != b"utf-8": - for k in range(table.n_buckets): - if kh_exist_str(table, k): - size = strlen(table.keys[k]) - result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) - i += 1 - elif PY3 or encoding != NULL: - for k in range(table.n_buckets): - if kh_exist_str(table, k): - result[i] = PyUnicode_FromString(table.keys[k]) - i += 1 - else: - for k in range(table.n_buckets): - if kh_exist_str(table, k): - result[i] = PyBytes_FromString(table.keys[k]) - i += 1 + # parse and box categories to python strings + i = 0 + result = np.empty(table.n_occupied, dtype=np.object_) + if path == ENCODED: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + size = strlen(table.keys[k]) + result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + i += 1 + elif path == UTF8: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyUnicode_FromString(table.keys[k]) + i += 1 + elif path == CSTRING: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyBytes_FromString(table.keys[k]) + i += 1 kh_destroy_str(table) return np.asarray(codes), result, na_count @@ -1645,12 +1611,12 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, object na_flist, - inference_data_t inference_data): +cdef _try_double(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 size_t i, lines + coliter_t it const char *word = NULL char *p_end double *data @@ -1666,19 +1632,17 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, - &na_count, inference_data) + na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count - cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, bint use_na_flist, const kh_float64_t *na_flist, - double NA, double *data, int *na_count, - inference_data_t inference_data) nogil: + double NA, + double *data, int *na_count) nogil: cdef: int error, size_t i @@ -1687,24 +1651,15 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int const char *word = NULL char *p_end khiter_t k, k64 - # only used with passed in data - khiter_t kit = 0 global errno na_count[0] = 0 - - # these type checks specialize at compile time - # see typedefs - if inference_data_t is use_parser_data: - coliter_setup(&it, parser, col, line_start) + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1730,11 +1685,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data += 1 else: for i in range(lines): - if inference_data_t is use_parser_data: - 
COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) - + COLITER_NEXT(it, word) data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1750,11 +1701,11 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int return 0 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, - inference_data_t inference_data): + bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 size_t i, lines + coliter_t it int64_t *data ndarray result @@ -1764,9 +1715,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data + coliter_setup(&it, parser, col, line_start) with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, - na_hashset, NA, data, &na_count, inference_data) + error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1777,26 +1728,21 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count, inference_data_t inference_data) nogil: + int *na_count) nogil: cdef: int error size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k, kit = 0 + khiter_t k na_count[0] = 0 - # compile time checks - if inference_data_t is use_parser_data: - coliter_setup(&it, parser, col, line_start) + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1810,10 +1756,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int return error else: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1882,8 +1825,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset, - inference_data_t inference_data): + const kh_str_t *true_hashset, const kh_str_t *false_hashset): cdef: int error, na_count = 0 size_t i, lines @@ -1899,9 +1841,8 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, - na_hashset, true_hashset, false_hashset, NA, data, - &na_count, inference_data) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, + true_hashset, false_hashset, NA, data, &na_count) if error != 0: return None, None return result.view(np.bool_), na_count @@ -1909,27 +1850,21 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int 
line_end, cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count, - inference_data_t inference_data) nogil: + uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: int error = 0 size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k, kit = 0 + khiter_t k na_count[0] = 0 - # compile time - if inference_data_t is use_parser_data: - coliter_setup(&it, parser, col, line_start) + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1956,10 +1891,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, data += 1 else: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1981,18 +1913,6 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, return 0 -cdef inline khiter_t _htable_next(kh_str_t *table, khiter_t k, char **word) nogil: - """given starting iterator, asssign next valid key to word and return - the next iterator""" - while k < table.n_buckets: - if kh_exist_str(table, k): - break - k += 1 - - word[0] = table.keys[k] - return (k + 1) - - cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: From 249094918be25a4b021806744fe2cd8f62389ead Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 16 Jul 2016 13:59:31 -0500 Subject: [PATCH 05/11] fix hash table ordering, null categories --- asv_bench/benchmarks/parser_vb.py | 4 +- pandas/io/tests/parser/c_parser_only.py | 52 +++++++++++++++++++------ pandas/parser.pyx | 13 +++---- pandas/tools/tests/test_concat.py | 3 ++ pandas/types/concat.py | 1 + 5 files changed, 52 insertions(+), 21 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 614bfa5ec5d6e..6dc8bffd6dac9 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -115,9 +115,9 @@ def teardown(self): class read_csv_categorical(object): - def setup(self): - goal_time = 0.2 + goal_time = 0.2 + def setup(self): N = 100000 group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] df = DataFrame({'a': np.random.choice(group1, N).astype('object'), diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 0d706a2fe2c4b..5ab2b6406ad6d 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -211,19 +211,49 @@ def test_categorical_dtype(self): 'c': [3.4, 3.4, 4.5]}) tm.assert_frame_equal(actual, expected) + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical.from_codes([0, 0, 1], + ['b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c 
+1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical.from_codes([0, -1, 1], + ['b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + def test_categorical_dtype_encoding(self): # GH 10153 - cases = [ - ('unicode_series.csv', 'latin-1'), - ('utf16_ex.txt', 'utf-16') - ] - - for f, encoding in cases: - pth = tm.get_data_path(f) - expected = self.read_csv(pth, header=None, encoding=encoding) - result = self.read_csv(pth, header=None, encoding=encoding, dtype='category') - result = result.apply(lambda x: x.astype(object)) - tm.assert_frame_equal(actual, expected) + pth = tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + actual[1] = actual[1].astype(object) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + actual = self.read_table(pth, encoding=encoding, dtype='category') + actual = actual.apply(lambda x: x.astype(object)) + tm.assert_frame_equal(actual, expected) def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: diff --git a/pandas/parser.pyx b/pandas/parser.pyx index e680cab5ff90c..3809c82654312 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1527,8 +1527,9 @@ cdef _categorical_convert(parser_t *parser, int col, lines = line_end - line_start codes = np.empty(lines, dtype=np.int64) + # factorize parsed values, creating a hash table - # bytes -> category + # bytes -> category code with nogil: table = kh_init_str() coliter_setup(&it, parser, col, line_start) @@ -1554,24 +1555,20 @@ cdef _categorical_convert(parser_t *parser, int col, codes[i] = table.vals[k] # parse and box categories to python strings - i = 0 result = np.empty(table.n_occupied, dtype=np.object_) if path == ENCODED: for k in range(table.n_buckets): if kh_exist_str(table, k): size = strlen(table.keys[k]) - result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) - i += 1 + result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors) elif path == UTF8: for k in range(table.n_buckets): if kh_exist_str(table, k): - result[i] = PyUnicode_FromString(table.keys[k]) - i += 1 + result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) elif path == CSTRING: for k in range(table.n_buckets): if kh_exist_str(table, k): - result[i] = PyBytes_FromString(table.keys[k]) - i += 1 + result[table.vals[k]] = PyBytes_FromString(table.keys[k]) kh_destroy_str(table) return np.asarray(codes), result, na_count diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 225ba533161b3..e3cc60e2856c2 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -850,6 +850,9 @@ def test_union_categorical(self): ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], + ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), + (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), diff --git a/pandas/types/concat.py b/pandas/types/concat.py index a7fd692cfb9cf..40268c37db393 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py 
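
The expected frames in the tests above spell out the representation contract: categories come back in first-appearance order and -1 is the missing-value code, which Categorical.from_codes accepts directly. For instance:

    import pandas as pd

    cat = pd.Categorical.from_codes([0, -1, 1], categories=['b', 'a'])
    print(list(cat))     # ['b', nan, 'a']
    print(cat.codes)     # [ 0 -1  1]
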
@@ -240,6 +240,7 @@ def union_categoricals(to_union, sort_categories=False): Emmpty list of categoricals passed """ from pandas import Index, Categorical + from pandas.core.algorithms import take_1d if len(to_union) == 0: raise ValueError('No Categoricals to union') From 12547687bc16924c898d0ae246670e2453c6cc04 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 21 Jul 2016 19:50:59 -0500 Subject: [PATCH 06/11] doc fixups; addl tests --- doc/source/io.rst | 15 +++++++++------ doc/source/whatsnew/v0.19.0.txt | 21 ++++++++++++--------- pandas/io/tests/parser/c_parser_only.py | 22 ++++++++++++++++++++++ pandas/parser.pyx | 8 ++++---- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4ff6a79a3ef64..81fb2871df3c0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -500,12 +500,14 @@ worth trying. data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. +.. _io.categorical: + Specifying Categorical dtype '''''''''''''''''''''''''''' .. versionadded:: 0.19.0 -`Categorical` columns can be parsed directly by specifying `dtype='category'` +``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` .. ipython :: python @@ -515,25 +517,26 @@ Specifying Categorical dtype pd.read_csv(StringIO(data)).dtypes pd.read_csv(StringIO(data), dtype='category').dtypes -Individual columns can be parsed as a `Categorical` using a dict specification +Individual columns can be parsed as a ``Categorical`` using a dict specification -.. ipython :: python +.. ipython:: python pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes .. note:: The resulting categories will always be parsed as string (object dtype). - Numeric categories can be converted using the :func:`pd.to_numeric` function. + If the categories are numeric they can be converted using the + :func:`pd.to_numeric` function, or as appropriate, another converter + such as :func:`pd.to_datetime`. - .. ipython :: python + .. ipython:: python df = pd.read_csv(StringIO(data), dtype='category') df.dtypes df['col3'] df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) df['col3'] ->>>>>>> undo type inference add docs and asv Naming and Using Columns diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9d51da1233dd8..db3381d558f1e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -12,6 +12,7 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` +- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -233,15 +234,15 @@ New behaviour: .. _whatsnew_0190.enhancements.read_csv_categorical: -:func:`read_csv` supports parsing `Categorical` directly +:func:`read_csv` supports parsing ``Categorical`` directly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The :func:`read_csv` function now supports parsing a `Categorical` column when +The :func:`read_csv` function now supports parsing a ``Categorical`` column when specified as a dtype (:issue:`10153`). Depending on the structure of the data, -this can result in a faster parse time and lower memory usage, compared to -converting to `Categorical` after parsing. 
+this can result in a faster parse time and lower memory usage compared to
+converting to ``Categorical`` after parsing. See the io :ref:`docs here <io.categorical>`

-.. ipython :: python
+.. ipython:: python

    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

@@ -249,18 +250,20 @@ converting to `Categorical` after parsing.
    pd.read_csv(StringIO(data)).dtypes
    pd.read_csv(StringIO(data), dtype='category').dtypes

-Individual columns can be parsed as a `Categorical` using a dict specification
+Individual columns can be parsed as a ``Categorical`` using a dict specification

-.. ipython :: python
+.. ipython:: python

    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes

 .. note::

     The resulting categories will always be parsed as string (object dtype).
-    Numeric categories can be converted using the :func:`pd.to_numeric` function.
+    If the categories are numeric they can be converted using the
+    :func:`pd.to_numeric` function, or as appropriate, another converter
+    such as :func:`pd.to_datetime`.

-    .. ipython :: python
+    .. ipython:: python

        df = pd.read_csv(StringIO(data), dtype='category')
        df.dtypes

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 5ab2b6406ad6d..a04113e0b3dba 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -136,6 +136,11 @@ def test_passing_dtype(self):
                           dtype={'A': 'timedelta64', 'B': 'float64'},
                           index_col=0)

+        # valid but unsupported - fixed width unicode string
+        self.assertRaises(TypeError, self.read_csv, path,
+                          dtype={'A': 'U8'},
+                          index_col=0)
+
         # see gh-12048: empty frame
         actual = self.read_csv(StringIO('A,B'), dtype=str)
         expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
@@ -255,6 +260,23 @@ def test_categorical_dtype_encoding(self):
         actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)

+    def test_categorical_dtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'])}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'])})]
+        actuals = self.read_csv(StringIO(data), dtype={'b':'category'},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_pass_dtype_as_recarray(self):
         if compat.is_platform_windows() and self.low_memory:
             raise nose.SkipTest(
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 3809c82654312..df1d8de0f99b7 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -1173,7 +1173,7 @@ cdef class TextReader:
         elif dtype.kind == 'U':
             width = dtype.itemsize
             if width > 0:
-                raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
+                raise TypeError("the dtype %s is not supported for parsing" % dtype)

             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
@@ -1187,10 +1187,10 @@ cdef class TextReader:
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
+        elif is_datetime64_dtype(dtype):
+            raise TypeError("the dtype %s is not supported for parsing, "
+                            "pass this column using parse_dates instead" % dtype)
         else:
-            if is_datetime64_dtype(dtype):
-                raise TypeError("the dtype %s is not supported for parsing, "
-                                "pass this column using parse_dates instead" % dtype)
             raise TypeError("the dtype %s is not supported for parsing" % dtype)

     cdef _string_convert(self, Py_ssize_t i, int start, int end,

From da5c5b575d5181255f15f422cb86c24dd30b9aa5 Mon Sep 17 00:00:00 2001
From: Chris
Date: Thu, 21 Jul 2016 19:55:05 -0500
Subject: [PATCH 07/11] flake8 fix

---
 pandas/io/tests/parser/c_parser_only.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index a04113e0b3dba..1d8381512a51c 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -271,7 +271,7 @@ def test_categorical_dtype_chunksize(self):
                                    'b': Categorical(['a', 'b'])}),
                      pd.DataFrame({'a': [1, 2],
                                    'b': Categorical(['b', 'c'])})]
-        actuals = self.read_csv(StringIO(data), dtype={'b':'category'},
+        actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
                                 chunksize=2)

         for actual, expected in zip(actuals, expecteds):

From 0f0dba63fb4d4aacb37ae4d839cf65eba1658f40 Mon Sep 17 00:00:00 2001
From: Chris
Date: Wed, 3 Aug 2016 20:06:52 -0500
Subject: [PATCH 08/11] wip

---
 doc/source/io.rst                       |  4 ++--
 doc/source/whatsnew/v0.19.0.txt         |  2 +-
 pandas/io/tests/parser/c_parser_only.py | 12 ++++++------
 pandas/parser.pyx                       | 17 ++++++++++++++++-
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 81fb2871df3c0..c3da848e86856 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -509,7 +509,7 @@ Specifying Categorical dtype

 ``Categorical`` columns can be parsed directly by specifying ``dtype='category'``

-.. ipython :: python
+.. ipython:: python

    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

@@ -525,7 +525,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

 .. note::

-   The resulting categories will always be parsed as string (object dtype).
+   The resulting categories will always be parsed as strings (object dtype).
    If the categories are numeric they can be converted using the
    :func:`pd.to_numeric` function, or as appropriate, another converter
    such as :func:`pd.to_datetime`.

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index db3381d558f1e..f790993d224c4 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -258,7 +258,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

 .. note::

-    The resulting categories will always be parsed as string (object dtype).
+    The resulting categories will always be parsed as strings (object dtype).
     If the categories are numeric they can be converted using the
     :func:`pd.to_numeric` function, or as appropriate, another converter
     such as :func:`pd.to_datetime`.
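
As a concrete sketch of the conversion described in the note above (reusing the
same sample ``data`` as the docs; every call below already appears in this
series, so nothing here is new API):

    import pandas as pd
    from pandas.compat import StringIO

    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

    # each column parses as Categorical with string categories
    df = pd.read_csv(StringIO(data), dtype='category')

    # col3 stores numbers as string categories; converting the
    # categories rather than the values touches only the unique
    # labels, which is why this is cheap even for long columns
    df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
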
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 1d8381512a51c..675904dff20c4 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -225,8 +225,7 @@ def test_categorical_dtype(self):
 1,b,3.4
 2,a,4.5"""
         expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
-                                 'b': Categorical.from_codes([0, 0, 1],
-                                                             ['b', 'a']),
+                                 'b': Categorical(['b', 'b', 'a']),
                                  'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
@@ -237,8 +236,7 @@ def test_categorical_dtype(self):
 1,nan,3.4
 2,a,4.5"""
         expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
-                                 'b': Categorical.from_codes([0, -1, 1],
-                                                             ['b', 'a']),
+                                 'b': Categorical(['b', np.nan, 'a']),
                                  'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
@@ -248,14 +246,15 @@ def test_categorical_dtype_encoding(self):
         pth = tm.get_data_path('unicode_series.csv')
         encoding = 'latin-1'
         expected = self.read_csv(pth, header=None, encoding=encoding)
+        expected[1] = Categorical(expected[1])
         actual = self.read_csv(pth, header=None, encoding=encoding,
                                dtype={1: 'category'})
-        actual[1] = actual[1].astype(object)
         tm.assert_frame_equal(actual, expected)

         pth = tm.get_data_path('utf16_ex.txt')
         encoding = 'utf-16'
         expected = self.read_table(pth, encoding=encoding)
+        expected = expected.apply(Categorical)
         actual = self.read_table(pth, encoding=encoding, dtype='category')
         actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)
@@ -270,7 +269,8 @@ def test_categorical_dtype_chunksize(self):
         expecteds = [pd.DataFrame({'a': [1, 1],
                                    'b': Categorical(['a', 'b'])}),
                      pd.DataFrame({'a': [1, 2],
-                                   'b': Categorical(['b', 'c'])})]
+                                   'b': Categorical(['b', 'c'])},
+                                  index=[2, 3])]
         actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
                                 chunksize=2)

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index df1d8de0f99b7..629d2f8b812e2 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -40,7 +40,9 @@ from pandas.types.common import (is_categorical_dtype, CategoricalDtype,
                                  is_string_dtype, is_datetime64_dtype,
                                  pandas_dtype)
 from pandas.core.categorical import Categorical
+from pandas.core.algorithms import take_1d
 from pandas.types.concat import union_categoricals
+from pandas import Index

 import time
 import os
@@ -1182,6 +1184,19 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(self.parser, i, start,
                                                          end, na_filter, na_hashset,
                                                          self.c_encoding)
+            print cats
+            print codes
+            # sort categories and recode if necessary
+            cats = Index(cats)
+            if not cats.is_monotonic_increasing:
+                unsorted = cats.copy()
+                cats = cats.sort_values()
+                indexer = unsorted.get_indexer(cats)
+                codes = take_1d(indexer, codes, fill_value=-1)
+            print indexer
+            print cats
+            print codes
+
             return Categorical(codes, categories=cats, ordered=False,
                                fastpath=True), na_count
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
@@ -2000,7 +2015,7 @@ def _concatenate_chunks(list chunks):
                 warning_columns.append(str(name))

             if is_categorical_dtype(dtypes.pop()):
-                result[name] = union_categoricals(arrs)
+                result[name] = union_categoricals(arrs, sort_categories=True)
             else:
                 result[name] = np.concatenate(arrs)

From 1f6093a0784ed5c9ec926ce999e4dbdc3b239d4c Mon Sep 17 00:00:00 2001
From: Chris
Date: Thu, 4 Aug 2016 18:17:11 -0500
Subject: [PATCH 09/11] rebase

---
 pandas/io/tests/parser/c_parser_only.py | 1 -
 pandas/parser.pyx                       | 7 +------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 675904dff20c4..4cea9e1d6b595 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -256,7 +256,6 @@ def test_categorical_dtype_encoding(self):
         expected = self.read_table(pth, encoding=encoding)
         expected = expected.apply(Categorical)
         actual = self.read_table(pth, encoding=encoding, dtype='category')
-        actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)

     def test_categorical_dtype_chunksize(self):
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 629d2f8b812e2..5af82be5b741b 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -1184,18 +1184,13 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(self.parser, i, start,
                                                          end, na_filter, na_hashset,
                                                          self.c_encoding)
-            print cats
-            print codes
             # sort categories and recode if necessary
             cats = Index(cats)
             if not cats.is_monotonic_increasing:
                 unsorted = cats.copy()
                 cats = cats.sort_values()
-                indexer = unsorted.get_indexer(cats)
+                indexer = cats.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
-            print indexer
-            print cats
-            print codes

             return Categorical(codes, categories=cats, ordered=False,
                                fastpath=True), na_count

From 75ed6ba0533e7e82db7342060d390194b4137723 Mon Sep 17 00:00:00 2001
From: Chris
Date: Thu, 4 Aug 2016 18:30:06 -0500
Subject: [PATCH 10/11] doc fixups

---
 doc/source/io.rst               | 4 ++--
 doc/source/whatsnew/v0.19.0.txt | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index c3da848e86856..7917e6b4cdfce 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -527,8 +527,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

    The resulting categories will always be parsed as strings (object dtype).
    If the categories are numeric they can be converted using the
-   :func:`pd.to_numeric` function, or as appropriate, another converter
-   such as :func:`pd.to_datetime`.
+   :func:`to_numeric` function, or as appropriate, another converter
+   such as :func:`to_datetime`.

    .. ipython:: python

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index f790993d224c4..6c995a6989a38 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -235,7 +235,7 @@ New behaviour:

 .. _whatsnew_0190.enhancements.read_csv_categorical:

 :func:`read_csv` supports parsing ``Categorical`` directly
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The :func:`read_csv` function now supports parsing a ``Categorical`` column when
 specified as a dtype (:issue:`10153`). Depending on the structure of the data,
@@ -260,8 +260,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

 .. note::

     The resulting categories will always be parsed as strings (object dtype).
     If the categories are numeric they can be converted using the
-    :func:`pd.to_numeric` function, or as appropriate, another converter
-    such as :func:`pd.to_datetime`.
+    :func:`to_numeric` function, or as appropriate, another converter
+    such as :func:`to_datetime`.

     .. ipython:: python

From c78f39f982d1a3d788ff57a6a9707979a813e3a7 Mon Sep 17 00:00:00 2001
From: Chris
Date: Sat, 6 Aug 2016 07:56:48 -0500
Subject: [PATCH 11/11] rebase fixup

---
 pandas/types/concat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index 40268c37db393..a7fd692cfb9cf 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -240,7 +240,6 @@ def union_categoricals(to_union, sort_categories=False):
         Empty list of categoricals passed
     """
     from pandas import Index, Categorical
-    from pandas.core.algorithms import take_1d

     if len(to_union) == 0:
         raise ValueError('No Categoricals to union')
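
Taken together, the series makes chunked categorical parsing deterministic:
each chunk factorizes only the labels it sees, per-chunk categories are sorted
(PATCH 08/09), and ``_concatenate_chunks`` merges the chunks with
``union_categoricals(arrs, sort_categories=True)``. A rough sketch of that
path from user code, using this branch's import location
(``pandas.types.concat``) and passing the underlying ``Categorical`` objects
via ``.values``, since Series support in ``union_categoricals`` is not assumed
here:

    import pandas as pd
    from pandas.compat import StringIO
    from pandas.types.concat import union_categoricals

    # the same data used by test_categorical_dtype_chunksize
    data = 'a,b\n1,a\n1,b\n1,b\n2,c'

    # each chunk sees and factorizes only its own labels:
    # chunk 1 -> categories ['a', 'b'], chunk 2 -> ['b', 'c']
    chunks = list(pd.read_csv(StringIO(data), dtype={'b': 'category'},
                              chunksize=2))

    # this mirrors what _concatenate_chunks does internally: the
    # category sets are unioned and sorted, so the result does not
    # depend on where the chunk boundaries fell
    merged = union_categoricals([c['b'].values for c in chunks],
                                sort_categories=True)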