From 286d90764bcaceb2a2cde3c637323611439b94b0 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 8 Jun 2016 18:26:28 -0500 Subject: [PATCH 01/11] ENH: parse categoricals in read_csv --- pandas/io/tests/parser/c_parser_only.py | 23 ++- pandas/parser.pyx | 258 ++++++++++++++++++++---- 2 files changed, 242 insertions(+), 39 deletions(-) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 103c9fa2b7ce8..9f264cadfa473 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,9 +12,10 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex +from pandas import DataFrame, Series, Index, MultiIndex, Categorical from pandas import compat from pandas.compat import StringIO, range, lrange +from pandas.types.dtypes import CategoricalDtype class CParserTests(object): @@ -184,6 +185,26 @@ def test_pass_dtype(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'object') + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical([1, 1, 2]), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical([3.4, 3.4, 4.5])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( diff --git a/pandas/parser.pyx b/pandas/parser.pyx index e72e2f90a5213..72801c631070b 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -25,6 +25,7 @@ cdef extern from "Python.h": cdef extern from "stdlib.h": void memcpy(void *dst, void *src, size_t n) +cimport cython cimport numpy as cnp from numpy cimport ndarray, uint8_t, uint64_t @@ -33,6 +34,9 @@ import numpy as np cimport util import pandas.lib as lib +from pandas.core.common import is_categorical_dtype, CategoricalDtype +from pandas.core.categorical import Categorical +from pandas.types.concat import union_categoricals import time import os @@ -220,6 +224,24 @@ cdef extern from "parser/tokenizer.h": int to_boolean(const char *item, uint8_t *val) nogil +# XXX +# this is a hack - in order to make the inference +# functions generic (converting either data directly +# from the parser or from a passed in hash table) +# we add an "optional" parameter via fused type, that can either +# be the hash table to parse, or an integer, which is used +# as a sentinel to specialize the function for reading +# from the parser. 
+ +# This is to avoid duplicating a bunch of code or +# adding runtime checks, but may be too much +ctypedef kh_str_t* kh_str_t_p +ctypedef int use_parser_data + +ctypedef fused inference_data_t: + kh_str_t_p + use_parser_data + cdef extern from "parser/io.h": void *new_mmap(char *fname) int del_mmap(void *src) @@ -475,12 +497,17 @@ cdef class TextReader: conv = {} for k in dtype: v = dtype[k] - if isinstance(v, basestring): + if is_categorical_dtype(v): + v = CategoricalDtype() + elif isinstance(v, basestring): v = np.dtype(v) conv[k] = v dtype = conv elif dtype is not None: - dtype = np.dtype(dtype) + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + else: + dtype = np.dtype(dtype) self.dtype = dtype @@ -1082,7 +1109,7 @@ cdef class TextReader: if col_dtype is not None: if not isinstance(col_dtype, basestring): - if isinstance(col_dtype, np.dtype): + if isinstance(col_dtype, np.dtype) or is_categorical_dtype(col_dtype): col_dtype = col_dtype.str else: col_dtype = np.dtype(col_dtype).str @@ -1138,11 +1165,12 @@ cdef class TextReader: object na_flist): if dtype[1] == 'i' or dtype[1] == 'u': result, na_count = _try_int64(self.parser, i, start, end, - na_filter, na_hashset) + na_filter, na_hashset, + NULL) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + "column {column}".format(column=i)) if result is not None and dtype[1:] != 'i8': result = result.astype(dtype) @@ -1151,7 +1179,8 @@ cdef class TextReader: elif dtype[1] == 'f': result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset, na_flist) + na_filter, na_hashset, na_flist, + NULL) if result is not None and dtype[1:] != 'f8': result = result.astype(dtype) @@ -1160,7 +1189,8 @@ cdef class TextReader: elif dtype[1] == 'b': result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, - self.true_set, self.false_set) + self.true_set, self.false_set, + NULL) return result, na_count elif dtype[1] == 'c': raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) @@ -1183,8 +1213,15 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - - + # is this comparison good enough? 
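
The branch below answers the "good enough?" question with a stop-gap: by this point dtypes reach _convert_with_dtype as dtype strings (note the col_dtype.str conversion above), and '|O08' is the string form that CategoricalDtype's .str yields here, so the equality test picks out categorical requests. Later commits in this series replace it with is_categorical_dtype on real dtype objects. A minimal sketch of that normalization, assuming the 0.19-era import locations adopted later in the series:

    from pandas.types.common import is_categorical_dtype, pandas_dtype

    for spec in ('category', 'int64', 'float64'):
        dt = pandas_dtype(spec)            # 'category' -> CategoricalDtype()
        print(repr(dt), is_categorical_dtype(dt))
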
+ elif dtype == '|O08': + codes, cats, na_count = _categorical_convert(self.parser, i, start, + end, na_filter, na_hashset, + na_flist, self.true_set, + self.false_set, self.c_encoding) + + return Categorical(codes, categories=cats, ordered=False, + fastpath=True), na_count elif dtype[1] == 'O': return self._string_convert(i, start, end, na_filter, na_hashset) @@ -1500,6 +1537,97 @@ cdef _string_box_decode(parser_t *parser, int col, return result, na_count +@cython.boundscheck(False) +cdef _categorical_convert(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + object na_flist, const kh_str_t *true_hashset, + const kh_str_t *false_hashset, + char *encoding): + "Convert column data into codes, categories" + cdef: + int error, na_count = 0 + Py_ssize_t i, size + size_t lines + coliter_t it + const char *word = NULL + int64_t NA = -1 + int64_t[:] codes + size_t current_category = 0 + + char *errors = "strict" + + int ret = 0 + kh_str_t *table + + khiter_t k + + lines = line_end - line_start + codes = np.empty(lines, dtype=np.int64) + with nogil: + table = kh_init_str() + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + COLITER_NEXT(it, word) + + if na_filter: + k = kh_get_str(na_hashset, word) + # is in NA values + if k != na_hashset.n_buckets: + na_count += 1 + codes[i] = NA + continue + + k = kh_get_str(table, word) + # not in the hash table + if k == table.n_buckets: + k = kh_put_str(table, word, &ret) + table.vals[k] = current_category + current_category += 1 + + codes[i] = table.vals[k] + + + # follow the same inference attempts as + # normal data (int64, float64, bool, object) + result, result_na = _try_int64(parser, col, 0, table.n_occupied, + na_filter, na_hashset, table) + if result is None: + result, result_na = _try_double(parser, col, 0, table.n_occupied, + na_filter, na_hashset, na_flist, + table) + if result is None: + # bool categorical doesn't really make sense, but following the + # inference path for now + result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied, + na_filter, na_hashset, true_hashset, + false_hashset, table) + # duplicated logic here, but doesn't make sense to reuse + # other string logic since those paths factorize where we + # already have guaranteed uniques + if result is None: + i = 0 + result = np.empty(table.n_occupied, dtype=np.object_) + if encoding != NULL and encoding != b"utf-8": + for k in range(table.n_buckets): + if kh_exist_str(table, k): + size = strlen(table.keys[k]) + result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + i += 1 + elif PY3 or encoding != NULL: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyUnicode_FromString(table.keys[k]) + i += 1 + else: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyBytes_FromString(table.keys[k]) + i += 1 + + kh_destroy_str(table) + return np.asarray(codes), result, na_count cdef _to_fw_string(parser_t *parser, int col, int line_start, int line_end, size_t width): @@ -1536,12 +1664,12 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, object na_flist): +cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, object na_flist, + inference_data_t inference_data): cdef: int error, na_count = 0 size_t i, lines - coliter_t it const char 
*word = NULL char *p_end double *data @@ -1557,17 +1685,19 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + na_filter, na_hashset, use_na_flist, na_fset, NA, data, + &na_count, inference_data) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count + cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, - const kh_float64_t *na_flist, - double NA, - double *data, int *na_count) nogil: + bint na_filter, kh_str_t *na_hashset, bint use_na_flist, + const kh_float64_t *na_flist, + double NA, double *data, int *na_count, + inference_data_t inference_data) nogil: cdef: int error, size_t i @@ -1576,15 +1706,24 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int const char *word = NULL char *p_end khiter_t k, k64 + # only used with passed in data + khiter_t kit = 0 global errno na_count[0] = 0 - coliter_setup(&it, parser, col, line_start) + + # these type checks specialize at compile time + # see typedefs + if inference_data_t is use_parser_data: + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1610,7 +1749,11 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data += 1 else: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) + data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1626,11 +1769,11 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int return 0 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_t *na_hashset, + inference_data_t inference_data): cdef: int error, na_count = 0 size_t i, lines - coliter_t it int64_t *data ndarray result @@ -1640,9 +1783,10 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data - coliter_setup(&it, parser, col, line_start) + # compile time with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, + na_hashset, NA, data, &na_count, inference_data) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1653,21 +1797,26 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count) nogil: + int *na_count, inference_data_t inference_data) nogil: cdef: int error size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k + khiter_t k, kit = 0 na_count[0] = 0 - coliter_setup(&it, parser, col, line_start) + # compile time checks 
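
These `inference_data_t is use_parser_data` tests are resolved while Cython generates each specialization, so the emitted C keeps only one branch and pays no runtime cost. A standalone sketch of the idiom (illustrative only, not the pandas code):

    ctypedef fused source_t:
        int
        double

    cdef double scaled(source_t x):
        # resolved at compile time: each generated specialization
        # keeps exactly one of these branches
        if source_t is int:
            return x * 2.0
        else:
            return x / 2.0

    # a caller can also pick the specialization explicitly, the syntax a
    # later commit uses as _try_int64[use_parser_data](...):
    #     scaled[int](3), scaled[double](3.0)
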
+ if inference_data_t is use_parser_data: + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1681,7 +1830,10 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int return error else: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1719,6 +1871,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l const char *word = NULL khiter_t k na_count[0] = 0 + coliter_setup(&it, parser, col, line_start) if na_filter: @@ -1749,7 +1902,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset): + const kh_str_t *true_hashset, const kh_str_t *false_hashset, + inference_data_t inference_data): cdef: int error, na_count = 0 size_t i, lines @@ -1765,8 +1919,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, - true_hashset, false_hashset, NA, data, &na_count) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, + na_hashset, true_hashset, false_hashset, NA, data, + &na_count, inference_data) if error != 0: return None, None return result.view(np.bool_), na_count @@ -1774,21 +1929,27 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count) nogil: + uint8_t NA, uint8_t *data, int *na_count, + inference_data_t inference_data) nogil: cdef: int error = 0 size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k + khiter_t k, kit = 0 na_count[0] = 0 - coliter_setup(&it, parser, col, line_start) + # compile time + if inference_data_t is use_parser_data: + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1815,7 +1976,10 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, data += 1 else: for i in range(lines): - COLITER_NEXT(it, word) + if inference_data_t is use_parser_data: + COLITER_NEXT(it, word) + else: + kit = _htable_next(inference_data, kit, &word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1836,6 +2000,19 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, return 0 + +cdef inline khiter_t _htable_next(kh_str_t *table, khiter_t k, char **word) nogil: + """given starting iterator, asssign next valid key to word and return + the next 
iterator""" + while k < table.n_buckets: + if kh_exist_str(table, k): + break + k += 1 + + word[0] = table.keys[k] + return (k + 1) + + cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1924,7 +2101,12 @@ def _concatenate_chunks(list chunks): common_type = np.find_common_type(dtypes, []) if common_type == np.object: warning_columns.append(str(name)) - result[name] = np.concatenate(arrs) + + if is_categorical_dtype(dtypes.pop()): + result[name] = union_categoricals(arrs) + #np.concatenate([c.codes for c in arrs]) + else: + result[name] = np.concatenate(arrs) if warning_columns: warning_names = ','.join(warning_columns) From cfa0ce402e5b18fb992b7a56325194faa17e9742 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 11 Jun 2016 06:30:46 -0500 Subject: [PATCH 02/11] clean up dtype checking, add function specialization --- pandas/parser.pyx | 105 ++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 59 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 72801c631070b..f67c7ed6ad85f 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -34,7 +34,10 @@ import numpy as np cimport util import pandas.lib as lib -from pandas.core.common import is_categorical_dtype, CategoricalDtype +from pandas.core.common import (is_categorical_dtype, CategoricalDtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_string_dtype, is_datetime64_dtype) from pandas.core.categorical import Categorical from pandas.types.concat import union_categoricals @@ -224,19 +227,13 @@ cdef extern from "parser/tokenizer.h": int to_boolean(const char *item, uint8_t *val) nogil -# XXX -# this is a hack - in order to make the inference -# functions generic (converting either data directly -# from the parser or from a passed in hash table) -# we add an "optional" parameter via fused type, that can either -# be the hash table to parse, or an integer, which is used -# as a sentinel to specialize the function for reading -# from the parser. -# This is to avoid duplicating a bunch of code or -# adding runtime checks, but may be too much +# to make the inference functions generic +# add an optional last parameter that is +# the source of data to be used +# other than the parser_t ctypedef kh_str_t* kh_str_t_p -ctypedef int use_parser_data +ctypedef void* use_parser_data ctypedef fused inference_data_t: kh_str_t_p @@ -421,11 +418,12 @@ cdef class TextReader: self._set_quoting(quotechar, quoting) - # TODO: endianness just a placeholder? 
+ + dtype_order = ['int64', 'float64', 'bool', 'object'] if quoting == QUOTE_NONNUMERIC: - self.dtype_cast_order = [' 1: @@ -1108,12 +1106,6 @@ cdef class TextReader: col_dtype = self.dtype if col_dtype is not None: - if not isinstance(col_dtype, basestring): - if isinstance(col_dtype, np.dtype) or is_categorical_dtype(col_dtype): - col_dtype = col_dtype.str - else: - col_dtype = np.dtype(col_dtype).str - col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, na_filter, 1, na_hashset, na_flist) @@ -1131,7 +1123,7 @@ cdef class TextReader: dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - '|O8', i, start, end, na_filter, 0, na_hashset, na_flist) + np.dtype('object'), i, start, end, na_filter, 0, na_hashset, na_flist) if col_res is not None: break @@ -1163,41 +1155,38 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if dtype[1] == 'i' or dtype[1] == 'u': - result, na_count = _try_int64(self.parser, i, start, end, - na_filter, na_hashset, - NULL) + if is_integer_dtype(dtype): + result, na_count = _try_int64[use_parser_data](self.parser, i, + start, end, na_filter, + na_hashset, NULL) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " "column {column}".format(column=i)) - if result is not None and dtype[1:] != 'i8': + if result is not None and dtype != 'int64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'f': - result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset, na_flist, - NULL) + elif is_float_dtype(dtype): + result, na_count = _try_double[use_parser_data](self.parser, i, start, end, + na_filter, na_hashset, na_flist, + NULL) - if result is not None and dtype[1:] != 'f8': + if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'b': - result, na_count = _try_bool_flex(self.parser, i, start, end, - na_filter, na_hashset, - self.true_set, self.false_set, - NULL) + elif is_bool_dtype(dtype): + result, na_count = _try_bool_flex[use_parser_data](self.parser, i, start, end, + na_filter, na_hashset, + self.true_set, self.false_set, + NULL) return result, na_count - elif dtype[1] == 'c': - raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) - - elif dtype[1] == 'S': + elif dtype.kind == 'S': # TODO: na handling - width = int(dtype[2:]) + width = dtype.itemsize if width > 0: result = _to_fw_string(self.parser, i, start, end, width) return result, 0 @@ -1205,8 +1194,8 @@ cdef class TextReader: # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, na_hashset) - elif dtype[1] == 'U': - width = int(dtype[2:]) + elif dtype.kind == 'U': + width = dtype.itemsize if width > 0: raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) @@ -1214,19 +1203,18 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) # is this comparison good enough? 
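
The deletions that follow are the payoff of the new predicates: positional checks on the dtype string (dtype[1], int(dtype[2:])) give way to is_*_dtype helpers and np.dtype attributes. A hedged illustration, using the pandas.core.common paths this commit imports from:

    from pandas.core.common import is_integer_dtype, is_categorical_dtype
    import numpy as np

    dt = np.dtype('S8')
    print(dt.kind, dt.itemsize)                 # S 8, replacing int(dtype_str[2:])
    print(is_integer_dtype(np.dtype('uint8')))  # True
    print(is_categorical_dtype('category'))     # True
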
- elif dtype == '|O08': + elif is_categorical_dtype(dtype): codes, cats, na_count = _categorical_convert(self.parser, i, start, end, na_filter, na_hashset, na_flist, self.true_set, self.false_set, self.c_encoding) - return Categorical(codes, categories=cats, ordered=False, fastpath=True), na_count - elif dtype[1] == 'O': + elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) else: - if dtype[1] == 'M': + if is_datetime64_dtype(dtype): raise TypeError("the dtype %s is not supported for parsing, " "pass this column using parse_dates instead" % dtype) raise TypeError("the dtype %s is not supported for parsing" % dtype) @@ -1588,7 +1576,7 @@ cdef _categorical_convert(parser_t *parser, int col, codes[i] = table.vals[k] - + # Codes are complete, now inference on cats # follow the same inference attempts as # normal data (int64, float64, bool, object) result, result_na = _try_int64(parser, col, 0, table.n_occupied, @@ -1603,9 +1591,10 @@ cdef _categorical_convert(parser_t *parser, int col, result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied, na_filter, na_hashset, true_hashset, false_hashset, table) - # duplicated logic here, but doesn't make sense to reuse - # other string logic since those paths factorize where we - # already have guaranteed uniques + + # if no numeric types parsed, convert to object. + # Note that the decoding path logic should sync up with that + # of `TextReader.string_convert` if result is None: i = 0 result = np.empty(table.n_occupied, dtype=np.object_) @@ -1694,10 +1683,10 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, - const kh_float64_t *na_flist, - double NA, double *data, int *na_count, - inference_data_t inference_data) nogil: + bint na_filter, kh_str_t *na_hashset, bint use_na_flist, + const kh_float64_t *na_flist, + double NA, double *data, int *na_count, + inference_data_t inference_data) nogil: cdef: int error, size_t i @@ -1783,7 +1772,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data - # compile time with nogil: error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count, inference_data) @@ -2104,7 +2092,6 @@ def _concatenate_chunks(list chunks): if is_categorical_dtype(dtypes.pop()): result[name] = union_categoricals(arrs) - #np.concatenate([c.codes for c in arrs]) else: result[name] = np.concatenate(arrs) From 849a112a37ee3cb0f9e3cd164ae17fd589343c98 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 12 Jun 2016 08:33:07 -0500 Subject: [PATCH 03/11] fix some dtype checking --- pandas/parser.pyx | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index f67c7ed6ad85f..84d2a52740845 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -38,6 +38,7 @@ from pandas.core.common import (is_categorical_dtype, CategoricalDtype, is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, is_string_dtype, is_datetime64_dtype) +from pandas.types.api import pandas_dtype from pandas.core.categorical import Categorical from pandas.types.concat import union_categoricals @@ -492,22 +493,13 @@ cdef class TextReader: self.encoding = encoding if isinstance(dtype, dict): - conv = {} - for k in dtype: 
- v = dtype[k] - if is_categorical_dtype(v): - v = CategoricalDtype() - elif isinstance(v, basestring): - v = np.dtype(v) - conv[k] = v - dtype = conv + dtype = {k: pandas_dtype(dtype[k]) + for k in dtype} elif dtype is not None: - if is_categorical_dtype(dtype): - dtype = CategoricalDtype() - else: - dtype = np.dtype(dtype) + dtype = pandas_dtype(dtype) self.dtype = dtype + print dtype # XXX self.noconvert = set() @@ -1101,7 +1093,8 @@ cdef class TextReader: col_dtype = self.dtype[i] else: if self.dtype.names: - col_dtype = self.dtype.descr[i][1] + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) else: col_dtype = self.dtype @@ -1202,7 +1195,6 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - # is this comparison good enough? elif is_categorical_dtype(dtype): codes, cats, na_count = _categorical_convert(self.parser, i, start, end, na_filter, na_hashset, From 4e0722d063aad3622eb23d17408eb0c03b81995f Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 16 Jul 2016 12:06:40 -0500 Subject: [PATCH 04/11] undo type inference add docs and asv --- asv_bench/benchmarks/parser_vb.py | 21 ++ doc/source/io.rst | 34 +++ doc/source/whatsnew/v0.19.0.txt | 46 ++++ pandas/io/tests/parser/c_parser_only.py | 24 ++- pandas/parser.pyx | 274 +++++++++--------------- 5 files changed, 220 insertions(+), 179 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 04f25034638cd..614bfa5ec5d6e 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -114,6 +114,27 @@ def teardown(self): os.remove('test.csv') +class read_csv_categorical(object): + def setup(self): + goal_time = 0.2 + + N = 100000 + group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] + df = DataFrame({'a': np.random.choice(group1, N).astype('object'), + 'b': np.random.choice(group1, N).astype('object'), + 'c': np.random.choice(group1, N).astype('object')}) + df.to_csv('strings.csv', index=False) + + def time_read_csv_categorical_post(self): + read_csv('strings.csv').apply(pd.Categorical) + + def time_read_csv_categorical_direct(self): + read_csv('strings.csv', dtype='category') + + def teardown(self): + os.remove('strings.csv') + + class read_table_multiple_date(object): goal_time = 0.2 diff --git a/doc/source/io.rst b/doc/source/io.rst index 2866371cce61a..4ff6a79a3ef64 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -500,6 +500,40 @@ worth trying. data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. +Specifying Categorical dtype +'''''''''''''''''''''''''''' + +.. versionadded:: 0.19.0 + +`Categorical` columns can be parsed directly by specifying `dtype='category'` + +.. ipython :: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a `Categorical` using a dict specification + +.. ipython :: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as string (object dtype). + Numeric categories can be converted using the :func:`pd.to_numeric` function. + + .. 
ipython :: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] +>>>>>>> undo type inference add docs and asv Naming and Using Columns diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 59a106291dad8..9d51da1233dd8 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -195,6 +195,14 @@ default of the index) in a DataFrame. :func:`read_csv` has improved support for duplicate column names ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: + + :ref:`Duplicate column names ` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) @@ -222,6 +230,44 @@ New behaviour: In [2]: pd.read_csv(StringIO(data), names=names) + +.. _whatsnew_0190.enhancements.read_csv_categorical: + +:func:`read_csv` supports parsing `Categorical` directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`read_csv` function now supports parsing a `Categorical` column when +specified as a dtype (:issue:`10153`). Depending on the structure of the data, +this can result in a faster parse time and lower memory usage, compared to +converting to `Categorical` after parsing. + +.. ipython :: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a `Categorical` using a dict specification + +.. ipython :: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as string (object dtype). + Numeric categories can be converted using the :func:`pd.to_numeric` function. + + .. ipython :: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] + .. 
_whatsnew_0190.enhancements.semi_month_offsets: Semi-Month Offsets diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 9f264cadfa473..0d706a2fe2c4b 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -191,9 +191,9 @@ def test_categorical_dtype(self): 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = pd.DataFrame({'a': Categorical([1, 1, 2]), + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical([3.4, 3.4, 4.5])}) + 'c': Categorical(['3.4', '3.4', '4.5'])}) actual = self.read_csv(StringIO(data), dtype='category') tm.assert_frame_equal(actual, expected) @@ -205,6 +205,26 @@ def test_categorical_dtype(self): 'c': CategoricalDtype()}) tm.assert_frame_equal(actual, expected) + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + cases = [ + ('unicode_series.csv', 'latin-1'), + ('utf16_ex.txt', 'utf-16') + ] + + for f, encoding in cases: + pth = tm.get_data_path(f) + expected = self.read_csv(pth, header=None, encoding=encoding) + result = self.read_csv(pth, header=None, encoding=encoding, dtype='category') + result = result.apply(lambda x: x.astype(object)) + tm.assert_frame_equal(actual, expected) + def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 84d2a52740845..e680cab5ff90c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -34,11 +34,11 @@ import numpy as np cimport util import pandas.lib as lib -from pandas.core.common import (is_categorical_dtype, CategoricalDtype, - is_integer_dtype, is_float_dtype, - is_bool_dtype, is_object_dtype, - is_string_dtype, is_datetime64_dtype) -from pandas.types.api import pandas_dtype +from pandas.types.common import (is_categorical_dtype, CategoricalDtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_string_dtype, is_datetime64_dtype, + pandas_dtype) from pandas.core.categorical import Categorical from pandas.types.concat import union_categoricals @@ -228,18 +228,6 @@ cdef extern from "parser/tokenizer.h": int to_boolean(const char *item, uint8_t *val) nogil - -# to make the inference functions generic -# add an optional last parameter that is -# the source of data to be used -# other than the parser_t -ctypedef kh_str_t* kh_str_t_p -ctypedef void* use_parser_data - -ctypedef fused inference_data_t: - kh_str_t_p - use_parser_data - cdef extern from "parser/io.h": void *new_mmap(char *fname) int del_mmap(void *src) @@ -499,7 +487,6 @@ cdef class TextReader: dtype = pandas_dtype(dtype) self.dtype = dtype - print dtype # XXX self.noconvert = set() @@ -706,6 +693,7 @@ cdef class TextReader: int status Py_ssize_t size char *errors = "strict" + cdef StringPath path = _string_path(self.c_encoding) header = [] @@ -735,20 +723,18 @@ cdef class TextReader: field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] - # TODO: Py3 vs. 
Py2 counts = {} unnamed_count = 0 for i in range(field_count): word = self.parser.words[start + i] - if self.c_encoding == NULL and not PY3: + if path == CSTRING: name = PyBytes_FromString(word) - else: - if self.c_encoding == NULL or self.c_encoding == b'utf-8': - name = PyUnicode_FromString(word) - else: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + elif path == UTF8: + name = PyUnicode_FromString(word) + elif path == ENCODED: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) if name == '': if self.has_mi_columns: @@ -1149,9 +1135,8 @@ cdef class TextReader: kh_str_t *na_hashset, object na_flist): if is_integer_dtype(dtype): - result, na_count = _try_int64[use_parser_data](self.parser, i, - start, end, na_filter, - na_hashset, NULL) + result, na_count = _try_int64(self.parser, i, start, end, na_filter, + na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " @@ -1163,19 +1148,17 @@ cdef class TextReader: return result, na_count elif is_float_dtype(dtype): - result, na_count = _try_double[use_parser_data](self.parser, i, start, end, - na_filter, na_hashset, na_flist, - NULL) + result, na_count = _try_double(self.parser, i, start, end, + na_filter, na_hashset, na_flist) if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count elif is_bool_dtype(dtype): - result, na_count = _try_bool_flex[use_parser_data](self.parser, i, start, end, - na_filter, na_hashset, - self.true_set, self.false_set, - NULL) + result, na_count = _try_bool_flex(self.parser, i, start, end, + na_filter, na_hashset, + self.true_set, self.false_set) return result, na_count elif dtype.kind == 'S': # TODO: na handling @@ -1186,7 +1169,7 @@ cdef class TextReader: # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, - na_hashset) + na_hashset) elif dtype.kind == 'U': width = dtype.itemsize if width > 0: @@ -1198,8 +1181,7 @@ cdef class TextReader: elif is_categorical_dtype(dtype): codes, cats, na_count = _categorical_convert(self.parser, i, start, end, na_filter, na_hashset, - na_flist, self.true_set, - self.false_set, self.c_encoding) + self.c_encoding) return Categorical(codes, categories=cats, ordered=False, fastpath=True), na_count elif is_object_dtype(dtype): @@ -1213,30 +1195,19 @@ cdef class TextReader: cdef _string_convert(self, Py_ssize_t i, int start, int end, bint na_filter, kh_str_t *na_hashset): - if PY3: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_factorize(self.parser, i, start, end, - na_filter, na_hashset) + + cdef StringPath path = _string_path(self.c_encoding) + + if path == UTF8: + return _string_box_utf8(self.parser, i, start, end, na_filter, + na_hashset) + elif path == ENCODED: + return _string_box_decode(self.parser, i, start, end, + na_filter, na_hashset, self.c_encoding) + elif path == CSTRING: + return 
_string_box_factorize(self.parser, i, start, end, + na_filter, na_hashset) + def _get_converter(self, i, name): if self.converters is None: @@ -1348,6 +1319,19 @@ def _maybe_upcast(arr): return arr +cdef enum StringPath: + CSTRING + UTF8 + ENCODED + +# factored out logic to pick string converter +cdef inline StringPath _string_path(char *encoding): + if encoding != NULL and encoding != b"utf-8": + return ENCODED + elif PY3 or encoding != NULL: + return UTF8 + else: + return CSTRING # ---------------------------------------------------------------------- # Type conversions / inference support code @@ -1521,8 +1505,6 @@ cdef _string_box_decode(parser_t *parser, int col, cdef _categorical_convert(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, - object na_flist, const kh_str_t *true_hashset, - const kh_str_t *false_hashset, char *encoding): "Convert column data into codes, categories" cdef: @@ -1531,19 +1513,22 @@ cdef _categorical_convert(parser_t *parser, int col, size_t lines coliter_t it const char *word = NULL + int64_t NA = -1 int64_t[:] codes - size_t current_category = 0 + int64_t current_category = 0 char *errors = "strict" + cdef StringPath path = _string_path(encoding) int ret = 0 kh_str_t *table - khiter_t k lines = line_end - line_start codes = np.empty(lines, dtype=np.int64) + # factorize parsed values, creating a hash table + # bytes -> category with nogil: table = kh_init_str() coliter_setup(&it, parser, col, line_start) @@ -1568,44 +1553,25 @@ cdef _categorical_convert(parser_t *parser, int col, codes[i] = table.vals[k] - # Codes are complete, now inference on cats - # follow the same inference attempts as - # normal data (int64, float64, bool, object) - result, result_na = _try_int64(parser, col, 0, table.n_occupied, - na_filter, na_hashset, table) - if result is None: - result, result_na = _try_double(parser, col, 0, table.n_occupied, - na_filter, na_hashset, na_flist, - table) - if result is None: - # bool categorical doesn't really make sense, but following the - # inference path for now - result, result_na = _try_bool_flex(parser, col, 0, table.n_occupied, - na_filter, na_hashset, true_hashset, - false_hashset, table) - - # if no numeric types parsed, convert to object. 
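
The nogil loop above is the heart of _categorical_convert. In pure Python, with a dict standing in for the khash table, the pass looks roughly like this (a sketch of the logic, not the implementation):

    import numpy as np

    def factorize_words(words, na_values=frozenset()):
        table = {}
        codes = np.empty(len(words), dtype=np.int64)
        na_count = 0
        for i, word in enumerate(words):
            if word in na_values:
                codes[i] = -1          # NA sentinel, as in the cython loop
                na_count += 1
                continue
            codes[i] = table.setdefault(word, len(table))
        # categories indexed by their code, i.e. first-appearance order
        cats = np.array(sorted(table, key=table.get), dtype=object)
        return codes, cats, na_count

    # factorize_words(['b', 'b', 'nan', 'a'], {'nan'}) ->
    # (array([ 0,  0, -1,  1]), array(['b', 'a'], dtype=object), 1)
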
- # Note that the decoding path logic should sync up with that - # of `TextReader.string_convert` - if result is None: - i = 0 - result = np.empty(table.n_occupied, dtype=np.object_) - if encoding != NULL and encoding != b"utf-8": - for k in range(table.n_buckets): - if kh_exist_str(table, k): - size = strlen(table.keys[k]) - result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) - i += 1 - elif PY3 or encoding != NULL: - for k in range(table.n_buckets): - if kh_exist_str(table, k): - result[i] = PyUnicode_FromString(table.keys[k]) - i += 1 - else: - for k in range(table.n_buckets): - if kh_exist_str(table, k): - result[i] = PyBytes_FromString(table.keys[k]) - i += 1 + # parse and box categories to python strings + i = 0 + result = np.empty(table.n_occupied, dtype=np.object_) + if path == ENCODED: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + size = strlen(table.keys[k]) + result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) + i += 1 + elif path == UTF8: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyUnicode_FromString(table.keys[k]) + i += 1 + elif path == CSTRING: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[i] = PyBytes_FromString(table.keys[k]) + i += 1 kh_destroy_str(table) return np.asarray(codes), result, na_count @@ -1645,12 +1611,12 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, object na_flist, - inference_data_t inference_data): +cdef _try_double(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 size_t i, lines + coliter_t it const char *word = NULL char *p_end double *data @@ -1666,19 +1632,17 @@ cdef inline _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, - &na_count, inference_data) + na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count - cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, bint use_na_flist, const kh_float64_t *na_flist, - double NA, double *data, int *na_count, - inference_data_t inference_data) nogil: + double NA, + double *data, int *na_count) nogil: cdef: int error, size_t i @@ -1687,24 +1651,15 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int const char *word = NULL char *p_end khiter_t k, k64 - # only used with passed in data - khiter_t kit = 0 global errno na_count[0] = 0 - - # these type checks specialize at compile time - # see typedefs - if inference_data_t is use_parser_data: - coliter_setup(&it, parser, col, line_start) + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1730,11 +1685,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data += 1 else: for i in range(lines): - if inference_data_t is use_parser_data: - 
COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) - + COLITER_NEXT(it, word) data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1750,11 +1701,11 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int return 0 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, - inference_data_t inference_data): + bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 size_t i, lines + coliter_t it int64_t *data ndarray result @@ -1764,9 +1715,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data + coliter_setup(&it, parser, col, line_start) with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, - na_hashset, NA, data, &na_count, inference_data) + error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1777,26 +1728,21 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count, inference_data_t inference_data) nogil: + int *na_count) nogil: cdef: int error size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k, kit = 0 + khiter_t k na_count[0] = 0 - # compile time checks - if inference_data_t is use_parser_data: - coliter_setup(&it, parser, col, line_start) + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1810,10 +1756,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int return error else: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1882,8 +1825,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset, - inference_data_t inference_data): + const kh_str_t *true_hashset, const kh_str_t *false_hashset): cdef: int error, na_count = 0 size_t i, lines @@ -1899,9 +1841,8 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, - na_hashset, true_hashset, false_hashset, NA, data, - &na_count, inference_data) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, + true_hashset, false_hashset, NA, data, &na_count) if error != 0: return None, None return result.view(np.bool_), na_count @@ -1909,27 +1850,21 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int 
line_end, cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count, - inference_data_t inference_data) nogil: + uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: int error = 0 size_t i size_t lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k, kit = 0 + khiter_t k na_count[0] = 0 - # compile time - if inference_data_t is use_parser_data: - coliter_setup(&it, parser, col, line_start) + coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1956,10 +1891,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, data += 1 else: for i in range(lines): - if inference_data_t is use_parser_data: - COLITER_NEXT(it, word) - else: - kit = _htable_next(inference_data, kit, &word) + COLITER_NEXT(it, word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1981,18 +1913,6 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, return 0 -cdef inline khiter_t _htable_next(kh_str_t *table, khiter_t k, char **word) nogil: - """given starting iterator, asssign next valid key to word and return - the next iterator""" - while k < table.n_buckets: - if kh_exist_str(table, k): - break - k += 1 - - word[0] = table.keys[k] - return (k + 1) - - cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: From 249094918be25a4b021806744fe2cd8f62389ead Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 16 Jul 2016 13:59:31 -0500 Subject: [PATCH 05/11] fix hash table ordering, null categories --- asv_bench/benchmarks/parser_vb.py | 4 +- pandas/io/tests/parser/c_parser_only.py | 52 +++++++++++++++++++------ pandas/parser.pyx | 13 +++---- pandas/tools/tests/test_concat.py | 3 ++ pandas/types/concat.py | 1 + 5 files changed, 52 insertions(+), 21 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 614bfa5ec5d6e..6dc8bffd6dac9 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -115,9 +115,9 @@ def teardown(self): class read_csv_categorical(object): - def setup(self): - goal_time = 0.2 + goal_time = 0.2 + def setup(self): N = 100000 group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] df = DataFrame({'a': np.random.choice(group1, N).astype('object'), diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 0d706a2fe2c4b..5ab2b6406ad6d 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -211,19 +211,49 @@ def test_categorical_dtype(self): 'c': [3.4, 3.4, 4.5]}) tm.assert_frame_equal(actual, expected) + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical.from_codes([0, 0, 1], + ['b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c 
+1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical.from_codes([0, -1, 1], + ['b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + def test_categorical_dtype_encoding(self): # GH 10153 - cases = [ - ('unicode_series.csv', 'latin-1'), - ('utf16_ex.txt', 'utf-16') - ] - - for f, encoding in cases: - pth = tm.get_data_path(f) - expected = self.read_csv(pth, header=None, encoding=encoding) - result = self.read_csv(pth, header=None, encoding=encoding, dtype='category') - result = result.apply(lambda x: x.astype(object)) - tm.assert_frame_equal(actual, expected) + pth = tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + actual[1] = actual[1].astype(object) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + actual = self.read_table(pth, encoding=encoding, dtype='category') + actual = actual.apply(lambda x: x.astype(object)) + tm.assert_frame_equal(actual, expected) def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: diff --git a/pandas/parser.pyx b/pandas/parser.pyx index e680cab5ff90c..3809c82654312 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1527,8 +1527,9 @@ cdef _categorical_convert(parser_t *parser, int col, lines = line_end - line_start codes = np.empty(lines, dtype=np.int64) + # factorize parsed values, creating a hash table - # bytes -> category + # bytes -> category code with nogil: table = kh_init_str() coliter_setup(&it, parser, col, line_start) @@ -1554,24 +1555,20 @@ cdef _categorical_convert(parser_t *parser, int col, codes[i] = table.vals[k] # parse and box categories to python strings - i = 0 result = np.empty(table.n_occupied, dtype=np.object_) if path == ENCODED: for k in range(table.n_buckets): if kh_exist_str(table, k): size = strlen(table.keys[k]) - result[i] = PyUnicode_Decode(table.keys[k], size, encoding, errors) - i += 1 + result[table.vals[k]] = PyUnicode_Decode(table.keys[k], size, encoding, errors) elif path == UTF8: for k in range(table.n_buckets): if kh_exist_str(table, k): - result[i] = PyUnicode_FromString(table.keys[k]) - i += 1 + result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) elif path == CSTRING: for k in range(table.n_buckets): if kh_exist_str(table, k): - result[i] = PyBytes_FromString(table.keys[k]) - i += 1 + result[table.vals[k]] = PyBytes_FromString(table.keys[k]) kh_destroy_str(table) return np.asarray(codes), result, na_count diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 225ba533161b3..e3cc60e2856c2 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -850,6 +850,9 @@ def test_union_categorical(self): ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], + ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), + (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), diff --git a/pandas/types/concat.py b/pandas/types/concat.py index a7fd692cfb9cf..40268c37db393 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py 
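
The expected frames in the tests above spell out the representation contract: categories come back in first-appearance order and -1 is the missing-value code, which Categorical.from_codes accepts directly. For instance:

    import pandas as pd

    cat = pd.Categorical.from_codes([0, -1, 1], categories=['b', 'a'])
    print(list(cat))     # ['b', nan, 'a']
    print(cat.codes)     # [ 0 -1  1]
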
@@ -240,6 +240,7 @@ def union_categoricals(to_union, sort_categories=False): Emmpty list of categoricals passed """ from pandas import Index, Categorical + from pandas.core.algorithms import take_1d if len(to_union) == 0: raise ValueError('No Categoricals to union') From 12547687bc16924c898d0ae246670e2453c6cc04 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 21 Jul 2016 19:50:59 -0500 Subject: [PATCH 06/11] doc fixups; addl tests --- doc/source/io.rst | 15 +++++++++------ doc/source/whatsnew/v0.19.0.txt | 21 ++++++++++++--------- pandas/io/tests/parser/c_parser_only.py | 22 ++++++++++++++++++++++ pandas/parser.pyx | 8 ++++---- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4ff6a79a3ef64..81fb2871df3c0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -500,12 +500,14 @@ worth trying. data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. +.. _io.categorical: + Specifying Categorical dtype '''''''''''''''''''''''''''' .. versionadded:: 0.19.0 -`Categorical` columns can be parsed directly by specifying `dtype='category'` +``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` .. ipython :: python @@ -515,25 +517,26 @@ Specifying Categorical dtype pd.read_csv(StringIO(data)).dtypes pd.read_csv(StringIO(data), dtype='category').dtypes -Individual columns can be parsed as a `Categorical` using a dict specification +Individual columns can be parsed as a ``Categorical`` using a dict specification -.. ipython :: python +.. ipython:: python pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes .. note:: The resulting categories will always be parsed as string (object dtype). - Numeric categories can be converted using the :func:`pd.to_numeric` function. + If the categories are numeric they can be converted using the + :func:`pd.to_numeric` function, or as appropriate, another converter + such as :func:`pd.to_datetime`. - .. ipython :: python + .. ipython:: python df = pd.read_csv(StringIO(data), dtype='category') df.dtypes df['col3'] df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) df['col3'] ->>>>>>> undo type inference add docs and asv Naming and Using Columns diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 9d51da1233dd8..db3381d558f1e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -12,6 +12,7 @@ Highlights include: - :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - ``.rolling()`` are now time-series aware, see :ref:`here ` - pandas development api, see :ref:`here ` +- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` .. contents:: What's new in v0.19.0 :local: @@ -233,15 +234,15 @@ New behaviour: .. _whatsnew_0190.enhancements.read_csv_categorical: -:func:`read_csv` supports parsing `Categorical` directly +:func:`read_csv` supports parsing ``Categorical`` directly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The :func:`read_csv` function now supports parsing a `Categorical` column when +The :func:`read_csv` function now supports parsing a ``Categorical`` column when specified as a dtype (:issue:`10153`). Depending on the structure of the data, -this can result in a faster parse time and lower memory usage, compared to -converting to `Categorical` after parsing. 
+this can result in a faster parse time and lower memory usage compared to
+converting to ``Categorical`` after parsing. See the io :ref:`docs here <io.categorical>`

-.. ipython :: python
+.. ipython:: python

    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

@@ -249,18 +250,20 @@ converting to `Categorical` after parsing.
    pd.read_csv(StringIO(data)).dtypes
    pd.read_csv(StringIO(data), dtype='category').dtypes

-Individual columns can be parsed as a `Categorical` using a dict specification
+Individual columns can be parsed as a ``Categorical`` using a dict specification

-.. ipython :: python
+.. ipython:: python

    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes

 .. note::

     The resulting categories will always be parsed as string (object dtype).
-    Numeric categories can be converted using the :func:`pd.to_numeric` function.
+    If the categories are numeric they can be converted using the
+    :func:`pd.to_numeric` function, or as appropriate, another converter
+    such as :func:`pd.to_datetime`.

-    .. ipython :: python
+    .. ipython:: python

        df = pd.read_csv(StringIO(data), dtype='category')
        df.dtypes

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 5ab2b6406ad6d..a04113e0b3dba 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -136,6 +136,11 @@ def test_passing_dtype(self):
                           dtype={'A': 'timedelta64', 'B': 'float64'},
                           index_col=0)

+        # valid but unsupported - fixed width unicode string
+        self.assertRaises(TypeError, self.read_csv, path,
+                          dtype={'A': 'U8'},
+                          index_col=0)
+
         # see gh-12048: empty frame
         actual = self.read_csv(StringIO('A,B'), dtype=str)
         expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
@@ -255,6 +260,23 @@ def test_categorical_dtype_encoding(self):
         actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)

+    def test_categorical_dtype_chunksize(self):
+        # GH 10153
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'])}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'])})]
+        actuals = self.read_csv(StringIO(data), dtype={'b':'category'},
+                                chunksize=2)
+
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_pass_dtype_as_recarray(self):
         if compat.is_platform_windows() and self.low_memory:
             raise nose.SkipTest(
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 3809c82654312..df1d8de0f99b7 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -1173,7 +1173,7 @@ cdef class TextReader:
         elif dtype.kind == 'U':
             width = dtype.itemsize
             if width > 0:
-                raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
+                raise TypeError("the dtype %s is not supported for parsing" % dtype)

             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
@@ -1187,10 +1187,10 @@ cdef class TextReader:
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
+        elif is_datetime64_dtype(dtype):
+            raise TypeError("the dtype %s is not supported for parsing, "
+                            "pass this column using parse_dates instead" % dtype)
         else:
-            if is_datetime64_dtype(dtype):
-                raise TypeError("the dtype %s is not supported for parsing, "
-                                "pass this column using parse_dates instead" % dtype)
             raise TypeError("the dtype %s is not supported for parsing" % dtype)

     cdef _string_convert(self, Py_ssize_t i, int start, int end,

From da5c5b575d5181255f15f422cb86c24dd30b9aa5 Mon Sep 17 00:00:00 2001
From: Chris
Date: Thu, 21 Jul 2016 19:55:05 -0500
Subject: [PATCH 07/11] flake8 fix

---
 pandas/io/tests/parser/c_parser_only.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index a04113e0b3dba..1d8381512a51c 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -271,7 +271,7 @@ def test_categorical_dtype_chunksize(self):
                                    'b': Categorical(['a', 'b'])}),
                      pd.DataFrame({'a': [1, 2],
                                    'b': Categorical(['b', 'c'])})]
-        actuals = self.read_csv(StringIO(data), dtype={'b':'category'},
+        actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
                                 chunksize=2)

         for actual, expected in zip(actuals, expecteds):

From 0f0dba63fb4d4aacb37ae4d839cf65eba1658f40 Mon Sep 17 00:00:00 2001
From: Chris
Date: Wed, 3 Aug 2016 20:06:52 -0500
Subject: [PATCH 08/11] wip

---
 doc/source/io.rst                       |  4 ++--
 doc/source/whatsnew/v0.19.0.txt         |  2 +-
 pandas/io/tests/parser/c_parser_only.py | 12 ++++++------
 pandas/parser.pyx                       | 17 ++++++++++++++++-
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 81fb2871df3c0..c3da848e86856 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -509,7 +509,7 @@ Specifying Categorical dtype

 ``Categorical`` columns can be parsed directly by specifying ``dtype='category'``

-.. ipython :: python
+.. ipython:: python

    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

@@ -525,7 +525,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

 .. note::

-   The resulting categories will always be parsed as string (object dtype).
+   The resulting categories will always be parsed as strings (object dtype).
    If the categories are numeric they can be converted using the
    :func:`pd.to_numeric` function, or as appropriate, another converter
    such as :func:`pd.to_datetime`.

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index db3381d558f1e..f790993d224c4 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -258,7 +258,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

 .. note::

-    The resulting categories will always be parsed as string (object dtype).
+    The resulting categories will always be parsed as strings (object dtype).
     If the categories are numeric they can be converted using the
     :func:`pd.to_numeric` function, or as appropriate, another converter
     such as :func:`pd.to_datetime`.
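
As a concrete sketch of the conversion described in the note above (reusing the
same sample ``data`` as the docs; every call below already appears in this
series, so nothing here is new API):

    import pandas as pd
    from pandas.compat import StringIO

    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'

    # each column parses as Categorical with string categories
    df = pd.read_csv(StringIO(data), dtype='category')

    # col3 stores numbers as string categories; converting the
    # categories rather than the values touches only the unique
    # labels, which is why this is cheap even for long columns
    df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
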
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 1d8381512a51c..675904dff20c4 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -225,8 +225,7 @@ def test_categorical_dtype(self):
 1,b,3.4
 2,a,4.5"""
         expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
-                                 'b': Categorical.from_codes([0, 0, 1],
-                                                             ['b', 'a']),
+                                 'b': Categorical(['b', 'b', 'a']),
                                  'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
@@ -237,8 +236,7 @@ def test_categorical_dtype(self):
 1,nan,3.4
 2,a,4.5"""
         expected = pd.DataFrame({'a': Categorical(['1', '1', '2']),
-                                 'b': Categorical.from_codes([0, -1, 1],
-                                                             ['b', 'a']),
+                                 'b': Categorical(['b', np.nan, 'a']),
                                  'c': Categorical(['3.4', '3.4', '4.5'])})
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
@@ -248,14 +246,15 @@ def test_categorical_dtype_encoding(self):
         pth = tm.get_data_path('unicode_series.csv')
         encoding = 'latin-1'
         expected = self.read_csv(pth, header=None, encoding=encoding)
+        expected[1] = Categorical(expected[1])
         actual = self.read_csv(pth, header=None, encoding=encoding,
                                dtype={1: 'category'})
-        actual[1] = actual[1].astype(object)
         tm.assert_frame_equal(actual, expected)

         pth = tm.get_data_path('utf16_ex.txt')
         encoding = 'utf-16'
         expected = self.read_table(pth, encoding=encoding)
+        expected = expected.apply(Categorical)
         actual = self.read_table(pth, encoding=encoding, dtype='category')
         actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)
@@ -270,7 +269,8 @@ def test_categorical_dtype_chunksize(self):
         expecteds = [pd.DataFrame({'a': [1, 1],
                                    'b': Categorical(['a', 'b'])}),
                      pd.DataFrame({'a': [1, 2],
-                                   'b': Categorical(['b', 'c'])})]
+                                   'b': Categorical(['b', 'c'])},
+                                  index=[2, 3])]
         actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
                                 chunksize=2)

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index df1d8de0f99b7..629d2f8b812e2 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -40,7 +40,9 @@ from pandas.types.common import (is_categorical_dtype, CategoricalDtype,
                                  is_string_dtype, is_datetime64_dtype,
                                  pandas_dtype)
 from pandas.core.categorical import Categorical
+from pandas.core.algorithms import take_1d
 from pandas.types.concat import union_categoricals
+from pandas import Index

 import time
 import os
@@ -1182,6 +1184,19 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(self.parser, i, start,
                                                          end, na_filter, na_hashset,
                                                          self.c_encoding)
+            print cats
+            print codes
+            # sort categories and recode if necessary
+            cats = Index(cats)
+            if not cats.is_monotonic_increasing:
+                unsorted = cats.copy()
+                cats = cats.sort_values()
+                indexer = unsorted.get_indexer(cats)
+                codes = take_1d(indexer, codes, fill_value=-1)
+            print indexer
+            print cats
+            print codes
+
             return Categorical(codes, categories=cats, ordered=False,
                                fastpath=True), na_count
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
@@ -2000,7 +2015,7 @@ def _concatenate_chunks(list chunks):
                 warning_columns.append(str(name))

             if is_categorical_dtype(dtypes.pop()):
-                result[name] = union_categoricals(arrs)
+                result[name] = union_categoricals(arrs, sort_categories=True)
             else:
                 result[name] = np.concatenate(arrs)

From 1f6093a0784ed5c9ec926ce999e4dbdc3b239d4c Mon Sep 17 00:00:00 2001
From: Chris
Date: Thu, 4 Aug 2016 18:17:11 -0500
Subject: [PATCH 09/11] rebase

---
 pandas/io/tests/parser/c_parser_only.py | 1 -
 pandas/parser.pyx                       | 7 +------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 675904dff20c4..4cea9e1d6b595 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -256,7 +256,6 @@ def test_categorical_dtype_encoding(self):
         expected = self.read_table(pth, encoding=encoding)
         expected = expected.apply(Categorical)
         actual = self.read_table(pth, encoding=encoding, dtype='category')
-        actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)

     def test_categorical_dtype_chunksize(self):
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 629d2f8b812e2..5af82be5b741b 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -1184,18 +1184,13 @@ cdef class TextReader:
             codes, cats, na_count = _categorical_convert(self.parser, i, start,
                                                          end, na_filter, na_hashset,
                                                          self.c_encoding)
-            print cats
-            print codes
             # sort categories and recode if necessary
             cats = Index(cats)
             if not cats.is_monotonic_increasing:
                 unsorted = cats.copy()
                 cats = cats.sort_values()
-                indexer = unsorted.get_indexer(cats)
+                indexer = cats.get_indexer(unsorted)
                 codes = take_1d(indexer, codes, fill_value=-1)
-            print indexer
-            print cats
-            print codes

             return Categorical(codes, categories=cats, ordered=False,
                                fastpath=True), na_count

From 75ed6ba0533e7e82db7342060d390194b4137723 Mon Sep 17 00:00:00 2001
From: Chris
Date: Thu, 4 Aug 2016 18:30:06 -0500
Subject: [PATCH 10/11] doc fixups

---
 doc/source/io.rst               | 4 ++--
 doc/source/whatsnew/v0.19.0.txt | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index c3da848e86856..7917e6b4cdfce 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -527,8 +527,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

    The resulting categories will always be parsed as strings (object dtype).
    If the categories are numeric they can be converted using the
-   :func:`pd.to_numeric` function, or as appropriate, another converter
-   such as :func:`pd.to_datetime`.
+   :func:`to_numeric` function, or as appropriate, another converter
+   such as :func:`to_datetime`.

    .. ipython:: python

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index f790993d224c4..6c995a6989a38 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -235,7 +235,7 @@ New behaviour:

 .. _whatsnew_0190.enhancements.read_csv_categorical:

 :func:`read_csv` supports parsing ``Categorical`` directly
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The :func:`read_csv` function now supports parsing a ``Categorical`` column when
 specified as a dtype (:issue:`10153`). Depending on the structure of the data,
@@ -260,8 +260,8 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification

 .. note::

     The resulting categories will always be parsed as strings (object dtype).
     If the categories are numeric they can be converted using the
-    :func:`pd.to_numeric` function, or as appropriate, another converter
-    such as :func:`pd.to_datetime`.
+    :func:`to_numeric` function, or as appropriate, another converter
+    such as :func:`to_datetime`.

     .. ipython:: python

From c78f39f982d1a3d788ff57a6a9707979a813e3a7 Mon Sep 17 00:00:00 2001
From: Chris
Date: Sat, 6 Aug 2016 07:56:48 -0500
Subject: [PATCH 11/11] rebase fixup

---
 pandas/types/concat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index 40268c37db393..a7fd692cfb9cf 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -240,7 +240,6 @@ def union_categoricals(to_union, sort_categories=False):
         Empty list of categoricals passed
     """
     from pandas import Index, Categorical
-    from pandas.core.algorithms import take_1d

     if len(to_union) == 0:
         raise ValueError('No Categoricals to union')
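
Taken together, the series makes chunked categorical parsing deterministic:
each chunk factorizes only the labels it sees, per-chunk categories are sorted
(PATCH 08/09), and ``_concatenate_chunks`` merges the chunks with
``union_categoricals(arrs, sort_categories=True)``. A rough sketch of that
path from user code, using this branch's import location
(``pandas.types.concat``) and passing the underlying ``Categorical`` objects
via ``.values``, since Series support in ``union_categoricals`` is not assumed
here:

    import pandas as pd
    from pandas.compat import StringIO
    from pandas.types.concat import union_categoricals

    # the same data used by test_categorical_dtype_chunksize
    data = 'a,b\n1,a\n1,b\n1,b\n2,c'

    # each chunk sees and factorizes only its own labels:
    # chunk 1 -> categories ['a', 'b'], chunk 2 -> ['b', 'c']
    chunks = list(pd.read_csv(StringIO(data), dtype={'b': 'category'},
                              chunksize=2))

    # this mirrors what _concatenate_chunks does internally: the
    # category sets are unioned and sorted, so the result does not
    # depend on where the chunk boundaries fell
    merged = union_categoricals([c['b'].values for c in chunks],
                                sort_categories=True)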