Skip to content

BUG: memory access bug in read_csv causing segfault #9846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Bug Fixes

- Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated.
- Bug in json serialization when frame has length zero. (:issue:`9805`)
- Bug in ``read_csv`` where missing trailing delimiters would cause segfault. (:issue:`5664`)


- Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`)
Expand Down
22 changes: 22 additions & 0 deletions pandas/io/tests/test_cparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,28 @@ def test_empty_field_eof(self):
2: np.array(['3', ''], dtype=object)}
assert_array_dicts_equal(result, expected)

# GH5664
a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
columns=list('abcd'),
index=[1, 1])
c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
[8, 9, 10, 11], [13, 14, nan, nan]],
columns=list('abcd'),
index=[0, 5, 7, 12])

for _ in range(100):
df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
names=['a'], engine='c')
assert_frame_equal(df, a)

df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2),
names=list("abcd"), engine='c')
assert_frame_equal(df, b)

df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
names=list('abcd'), engine='c')
assert_frame_equal(df, c)

def assert_array_dicts_equal(left, right):
for k, v in compat.iteritems(left):
Expand Down
79 changes: 26 additions & 53 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ cdef extern from "parser/tokenizer.h":
int col

void coliter_setup(coliter_t *it, parser_t *parser, int i, int start)
char* COLITER_NEXT(coliter_t it)
void COLITER_NEXT(coliter_t, const char *)

parser_t* parser_new()

Expand Down Expand Up @@ -212,7 +212,7 @@ cdef extern from "parser/tokenizer.h":
inline int to_longlong(char *item, long long *p_value)
# inline int to_longlong_thousands(char *item, long long *p_value,
# char tsep)
int to_boolean(char *item, uint8_t *val)
int to_boolean(const char *item, uint8_t *val)


cdef extern from "parser/io.h":
Expand Down Expand Up @@ -1279,7 +1279,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
Py_ssize_t i
size_t lines
coliter_t it
char *word
const char *word = NULL
ndarray[object] result

int ret = 0
Expand All @@ -1296,7 +1296,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
coliter_setup(&it, parser, col, line_start)

for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

if na_filter:
k = kh_get_str(na_hashset, word)
Expand Down Expand Up @@ -1333,7 +1333,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
Py_ssize_t i
size_t lines
coliter_t it
char *word
const char *word = NULL
ndarray[object] result

int ret = 0
Expand All @@ -1350,7 +1350,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
coliter_setup(&it, parser, col, line_start)

for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

if na_filter:
k = kh_get_str(na_hashset, word)
Expand Down Expand Up @@ -1388,7 +1388,7 @@ cdef _string_box_decode(parser_t *parser, int col,
Py_ssize_t i, size
size_t lines
coliter_t it
char *word
const char *word = NULL
ndarray[object] result

int ret = 0
Expand All @@ -1407,7 +1407,7 @@ cdef _string_box_decode(parser_t *parser, int col,
coliter_setup(&it, parser, col, line_start)

for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

if na_filter:
k = kh_get_str(na_hashset, word)
Expand Down Expand Up @@ -1444,7 +1444,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
int error
Py_ssize_t i, j
coliter_t it
char *word
const char *word = NULL
char *data
ndarray result

Expand All @@ -1454,7 +1454,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
coliter_setup(&it, parser, col, line_start)

for i in range(line_end - line_start):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
strncpy(data, word, width)
data += width

Expand All @@ -1469,7 +1469,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
char *p_end
double *data
double NA = na_values[np.float64]
Expand All @@ -1485,7 +1485,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(na_hashset, word)
# in the hash table
Expand All @@ -1509,7 +1509,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
data += 1
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
Expand All @@ -1530,7 +1530,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
int64_t *data
ndarray result

Expand All @@ -1544,7 +1544,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
k = kh_get_str(na_hashset, word)
# in the hash table
if k != na_hashset.n_buckets:
Expand All @@ -1561,7 +1561,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
return None, None
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
Expand All @@ -1578,7 +1578,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
uint8_t *data
ndarray result

Expand All @@ -1592,7 +1592,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(na_hashset, word)
# in the hash table
Expand All @@ -1608,7 +1608,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
data += 1
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

error = to_boolean(word, data)
if error != 0:
Expand All @@ -1625,7 +1625,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
uint8_t *data
ndarray result

Expand All @@ -1639,7 +1639,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(na_hashset, word)
# in the hash table
Expand Down Expand Up @@ -1667,7 +1667,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
data += 1
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(true_hashset, word)
if k != true_hashset.n_buckets:
Expand All @@ -1688,33 +1688,6 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

return result.view(np.bool_), na_count

cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end,
kh_str_t *na_hashset):
cdef:
int error
Py_ssize_t i
size_t lines
coliter_t it
char *word
ndarray[uint8_t, cast=True] result
khiter_t k

lines = line_end - line_start
result = np.empty(lines, dtype=np.bool_)

coliter_setup(&it, parser, col, line_start)
for i in range(lines):
word = COLITER_NEXT(it)

k = kh_get_str(na_hashset, word)
# in the hash table
if k != na_hashset.n_buckets:
result[i] = 1
else:
result[i] = 0

return result

cdef kh_str_t* kset_from_list(list values) except NULL:
# caller takes responsibility for freeing the hash table
cdef:
Expand Down Expand Up @@ -1897,7 +1870,7 @@ cdef _apply_converter(object f, parser_t *parser, int col,
Py_ssize_t i
size_t lines
coliter_t it
char *word
const char *word = NULL
char *errors = "strict"
ndarray[object] result
object val
Expand All @@ -1909,17 +1882,17 @@ cdef _apply_converter(object f, parser_t *parser, int col,

if not PY3 and c_encoding == NULL:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
val = PyBytes_FromString(word)
result[i] = f(val)
elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'):
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
val = PyUnicode_FromString(word)
result[i] = f(val)
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
val = PyUnicode_Decode(word, strlen(word),
c_encoding, errors)
result[i] = f(val)
Expand Down
Loading