diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index f5b158d717357..3ff2406220e35 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -91,6 +91,7 @@ Bug Fixes - Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated. - Bug in json serialization when frame has length zero.(:issue:`9805`) +- Bug in ``read_csv`` where missing trailing delimiters would cause a segfault. (:issue:`5664`) - Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index ad6f071d738ff..93d55c654de90 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -336,6 +336,28 @@ def test_empty_field_eof(self): 2: np.array(['3', ''], dtype=object)} assert_array_dicts_equal(result, expected) + # GH5664 + a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c']) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], + columns=list('abcd'), + index=[1, 1]) + c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan], + [8, 9, 10, 11], [13, 14, nan, nan]], + columns=list('abcd'), + index=[0, 5, 7, 12]) + + for _ in range(100): + df = read_csv(StringIO('a,b\nc\n'), skiprows=0, + names=['a'], engine='c') + assert_frame_equal(df, a) + + df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2), + names=list("abcd"), engine='c') + assert_frame_equal(df, b) + + df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'), + names=list('abcd'), engine='c') + assert_frame_equal(df, c) def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d13781d6fa132..73a03fc5cef7c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -175,7 +175,7 @@ cdef extern from "parser/tokenizer.h": int col void coliter_setup(coliter_t *it, parser_t *parser, int 
i, int start) - char* COLITER_NEXT(coliter_t it) + void COLITER_NEXT(coliter_t, const char *) parser_t* parser_new() @@ -212,7 +212,7 @@ cdef extern from "parser/tokenizer.h": inline int to_longlong(char *item, long long *p_value) # inline int to_longlong_thousands(char *item, long long *p_value, # char tsep) - int to_boolean(char *item, uint8_t *val) + int to_boolean(const char *item, uint8_t *val) cdef extern from "parser/io.h": @@ -1279,7 +1279,7 @@ cdef _string_box_factorize(parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1296,7 +1296,7 @@ cdef _string_box_factorize(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1333,7 +1333,7 @@ cdef _string_box_utf8(parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1350,7 +1350,7 @@ cdef _string_box_utf8(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1388,7 +1388,7 @@ cdef _string_box_decode(parser_t *parser, int col, Py_ssize_t i, size size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1407,7 +1407,7 @@ cdef _string_box_decode(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1444,7 +1444,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, int error Py_ssize_t i, j coliter_t it - char *word + const char *word = NULL char *data ndarray result @@ -1454,7 +1454,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, 
coliter_setup(&it, parser, col, line_start) for i in range(line_end - line_start): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) strncpy(data, word, width) data += width @@ -1469,7 +1469,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL char *p_end double *data double NA = na_values[np.float64] @@ -1485,7 +1485,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1509,7 +1509,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1530,7 +1530,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL int64_t *data ndarray result @@ -1544,7 +1544,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1561,7 +1561,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return None, None else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1578,7 +1578,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL uint8_t *data ndarray result @@ -1592,7 +1592,7 @@ cdef 
_try_bool(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1608,7 +1608,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) error = to_boolean(word, data) if error != 0: @@ -1625,7 +1625,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL uint8_t *data ndarray result @@ -1639,7 +1639,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1667,7 +1667,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1688,33 +1688,6 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, return result.view(np.bool_), na_count -cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end, - kh_str_t *na_hashset): - cdef: - int error - Py_ssize_t i - size_t lines - coliter_t it - char *word - ndarray[uint8_t, cast=True] result - khiter_t k - - lines = line_end - line_start - result = np.empty(lines, dtype=np.bool_) - - coliter_setup(&it, parser, col, line_start) - for i in range(lines): - word = COLITER_NEXT(it) - - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: - result[i] = 1 - else: - result[i] = 0 - - return result - cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1897,7 +1870,7 @@ cdef 
_apply_converter(object f, parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL char *errors = "strict" ndarray[object] result object val @@ -1909,17 +1882,17 @@ cdef _apply_converter(object f, parser_t *parser, int col, if not PY3 and c_encoding == NULL: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyBytes_FromString(word) result[i] = f(val) elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'): for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyUnicode_FromString(word) result[i] = f(val) else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyUnicode_Decode(word, strlen(word), c_encoding, errors) result[i] = f(val) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1bc4096658b29..1850aab50b55a 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -38,7 +38,7 @@ See LICENSE for the license * RESTORE_FINAL (2): * Put the file position at the next byte after the * data read from the file_buffer. 
-* +* #define RESTORE_NOT 0 #define RESTORE_INITIAL 1 #define RESTORE_FINAL 2 @@ -304,7 +304,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->stream_len, &self->stream_cap, nbytes * 2, sizeof(char), &status); - TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", + TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { @@ -334,7 +334,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words_len, &self->words_cap, nbytes, sizeof(char*), &status); - TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", + TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -371,7 +371,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->lines + 1, &self->lines_cap, nbytes, sizeof(int), &status); - TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -398,7 +398,7 @@ static int push_char(parser_t *self, char c) { /* TRACE(("pushing %c \n", c)) */ TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { - TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", + TRACE(("push_char: ERROR!!! 
self->stream_len(%d) >= self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) self->error_msg = (char*) malloc(64); sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); @@ -463,7 +463,6 @@ static void append_warning(parser_t *self, const char *msg) { static int end_line(parser_t *self) { int fields; - khiter_t k; /* for hash set detection */ int ex_fields = self->expected_fields; char *msg; @@ -483,7 +482,7 @@ static int end_line(parser_t *self) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; - + // skip the tokens from this bad line self->line_start[self->lines] += fields; @@ -605,12 +604,11 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int status; size_t bytes_read; - void *src = self->source; status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); - TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", nbytes, bytes_read, status)); self->datalen = bytes_read; @@ -704,7 +702,7 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != + return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != ((kh_int64_t*)self->skipset)->n_buckets ); } else { @@ -784,7 +782,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) else self->state = EAT_CRNL; break; - } + } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; @@ -1750,7 +1748,7 @@ int parser_trim_buffers(parser_t *self) { /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + 
TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); @@ -1871,7 +1869,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { } } - TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", + TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ @@ -2033,7 +2031,7 @@ int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep) return status; }*/ -int to_boolean(char *item, uint8_t *val) { +int to_boolean(const char *item, uint8_t *val) { char *tmp; int i, status = 0; @@ -2357,7 +2355,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, num_digits++; num_decimals++; } - + if (num_digits >= max_digits) // consume extra decimal digits while (isdigit(*p)) ++p; @@ -2653,4 +2651,4 @@ uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error) *error = 0; return number; } -*/ \ No newline at end of file +*/ diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 694a73ec78153..d3777e858b6ca 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -228,9 +228,12 @@ coliter_t *coliter_new(parser_t *self, int i); /* #define COLITER_NEXT(iter) iter->words[iter->line_start[iter->line++] + iter->col] */ // #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col] -#define COLITER_NEXT(iter) iter.words[*iter.line_start++ + iter.col] +#define COLITER_NEXT(iter, word) do { \ + const int i = *iter.line_start++ + iter.col; \ + word = i < *iter.line_start ? 
iter.words[i]: ""; \ + } while(0) -parser_t* parser_new(); +parser_t* parser_new(void); int parser_init(parser_t *self); @@ -270,6 +273,6 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, in //int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); int P_INLINE to_longlong(char *item, long long *p_value); //int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); -int to_boolean(char *item, uint8_t *val); +int to_boolean(const char *item, uint8_t *val); #endif // _PARSER_COMMON_H_