Skip to content

Commit abfb27c

Browse files
committed
Fix memory access bug in read_csv causing segfault
1 parent 5dff7df commit abfb27c

File tree

4 files changed

+68
-72
lines changed

4 files changed

+68
-72
lines changed

pandas/io/tests/test_cparser.py

+22
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,28 @@ def test_empty_field_eof(self):
336336
2: np.array(['3', ''], dtype=object)}
337337
assert_array_dicts_equal(result, expected)
338338

339+
# GH5664
340+
a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
341+
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
342+
columns=list('abcd'),
343+
index=[1, 1])
344+
c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
345+
[8, 9, 10, 11], [13, 14, nan, nan]],
346+
columns=list('abcd'),
347+
index=[0, 5, 7, 12])
348+
349+
for _ in range(100):
350+
df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
351+
names=['a'], engine='c')
352+
assert_frame_equal(df, a)
353+
354+
df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2),
355+
names=list("abcd"), engine='c')
356+
assert_frame_equal(df, b)
357+
358+
df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
359+
names=list('abcd'))
360+
assert_frame_equal(df, c)
339361

340362
def assert_array_dicts_equal(left, right):
341363
for k, v in compat.iteritems(left):

pandas/parser.pyx

+26-53
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ cdef extern from "parser/tokenizer.h":
175175
int col
176176

177177
void coliter_setup(coliter_t *it, parser_t *parser, int i, int start)
178-
char* COLITER_NEXT(coliter_t it)
178+
void COLITER_NEXT(coliter_t it, const char * word)
179179

180180
parser_t* parser_new()
181181

@@ -212,7 +212,7 @@ cdef extern from "parser/tokenizer.h":
212212
inline int to_longlong(char *item, long long *p_value)
213213
# inline int to_longlong_thousands(char *item, long long *p_value,
214214
# char tsep)
215-
int to_boolean(char *item, uint8_t *val)
215+
int to_boolean(const char *item, uint8_t *val)
216216

217217

218218
cdef extern from "parser/io.h":
@@ -1279,7 +1279,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
12791279
Py_ssize_t i
12801280
size_t lines
12811281
coliter_t it
1282-
char *word
1282+
const char * word = NULL
12831283
ndarray[object] result
12841284

12851285
int ret = 0
@@ -1296,7 +1296,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
12961296
coliter_setup(&it, parser, col, line_start)
12971297

12981298
for i in range(lines):
1299-
word = COLITER_NEXT(it)
1299+
COLITER_NEXT(it, word)
13001300

13011301
if na_filter:
13021302
k = kh_get_str(na_hashset, word)
@@ -1333,7 +1333,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
13331333
Py_ssize_t i
13341334
size_t lines
13351335
coliter_t it
1336-
char *word
1336+
const char * word = NULL
13371337
ndarray[object] result
13381338

13391339
int ret = 0
@@ -1350,7 +1350,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
13501350
coliter_setup(&it, parser, col, line_start)
13511351

13521352
for i in range(lines):
1353-
word = COLITER_NEXT(it)
1353+
COLITER_NEXT(it, word)
13541354

13551355
if na_filter:
13561356
k = kh_get_str(na_hashset, word)
@@ -1388,7 +1388,7 @@ cdef _string_box_decode(parser_t *parser, int col,
13881388
Py_ssize_t i, size
13891389
size_t lines
13901390
coliter_t it
1391-
char *word
1391+
const char * word = NULL
13921392
ndarray[object] result
13931393

13941394
int ret = 0
@@ -1407,7 +1407,7 @@ cdef _string_box_decode(parser_t *parser, int col,
14071407
coliter_setup(&it, parser, col, line_start)
14081408

14091409
for i in range(lines):
1410-
word = COLITER_NEXT(it)
1410+
COLITER_NEXT(it, word)
14111411

14121412
if na_filter:
14131413
k = kh_get_str(na_hashset, word)
@@ -1444,7 +1444,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
14441444
int error
14451445
Py_ssize_t i, j
14461446
coliter_t it
1447-
char *word
1447+
const char * word = NULL
14481448
char *data
14491449
ndarray result
14501450

@@ -1454,7 +1454,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
14541454
coliter_setup(&it, parser, col, line_start)
14551455

14561456
for i in range(line_end - line_start):
1457-
word = COLITER_NEXT(it)
1457+
COLITER_NEXT(it, word)
14581458
strncpy(data, word, width)
14591459
data += width
14601460

@@ -1469,7 +1469,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14691469
int error, na_count = 0
14701470
size_t i, lines
14711471
coliter_t it
1472-
char *word
1472+
const char * word = NULL
14731473
char *p_end
14741474
double *data
14751475
double NA = na_values[np.float64]
@@ -1485,7 +1485,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14851485

14861486
if na_filter:
14871487
for i in range(lines):
1488-
word = COLITER_NEXT(it)
1488+
COLITER_NEXT(it, word)
14891489

14901490
k = kh_get_str(na_hashset, word)
14911491
# in the hash table
@@ -1509,7 +1509,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
15091509
data += 1
15101510
else:
15111511
for i in range(lines):
1512-
word = COLITER_NEXT(it)
1512+
COLITER_NEXT(it, word)
15131513
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
15141514
parser.thousands, 1)
15151515
if errno != 0 or p_end[0] or p_end == word:
@@ -1530,7 +1530,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
15301530
int error, na_count = 0
15311531
size_t i, lines
15321532
coliter_t it
1533-
char *word
1533+
const char * word = NULL
15341534
int64_t *data
15351535
ndarray result
15361536

@@ -1544,7 +1544,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
15441544

15451545
if na_filter:
15461546
for i in range(lines):
1547-
word = COLITER_NEXT(it)
1547+
COLITER_NEXT(it, word)
15481548
k = kh_get_str(na_hashset, word)
15491549
# in the hash table
15501550
if k != na_hashset.n_buckets:
@@ -1561,7 +1561,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
15611561
return None, None
15621562
else:
15631563
for i in range(lines):
1564-
word = COLITER_NEXT(it)
1564+
COLITER_NEXT(it, word)
15651565
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
15661566
&error, parser.thousands)
15671567
if error != 0:
@@ -1578,7 +1578,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
15781578
int error, na_count = 0
15791579
size_t i, lines
15801580
coliter_t it
1581-
char *word
1581+
const char * word = NULL
15821582
uint8_t *data
15831583
ndarray result
15841584

@@ -1592,7 +1592,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
15921592

15931593
if na_filter:
15941594
for i in range(lines):
1595-
word = COLITER_NEXT(it)
1595+
COLITER_NEXT(it, word)
15961596

15971597
k = kh_get_str(na_hashset, word)
15981598
# in the hash table
@@ -1608,7 +1608,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
16081608
data += 1
16091609
else:
16101610
for i in range(lines):
1611-
word = COLITER_NEXT(it)
1611+
COLITER_NEXT(it, word)
16121612

16131613
error = to_boolean(word, data)
16141614
if error != 0:
@@ -1625,7 +1625,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
16251625
int error, na_count = 0
16261626
size_t i, lines
16271627
coliter_t it
1628-
char *word
1628+
const char * word = NULL
16291629
uint8_t *data
16301630
ndarray result
16311631

@@ -1639,7 +1639,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
16391639

16401640
if na_filter:
16411641
for i in range(lines):
1642-
word = COLITER_NEXT(it)
1642+
COLITER_NEXT(it, word)
16431643

16441644
k = kh_get_str(na_hashset, word)
16451645
# in the hash table
@@ -1667,7 +1667,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
16671667
data += 1
16681668
else:
16691669
for i in range(lines):
1670-
word = COLITER_NEXT(it)
1670+
COLITER_NEXT(it, word)
16711671

16721672
k = kh_get_str(true_hashset, word)
16731673
if k != true_hashset.n_buckets:
@@ -1688,33 +1688,6 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
16881688

16891689
return result.view(np.bool_), na_count
16901690

1691-
cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end,
1692-
kh_str_t *na_hashset):
1693-
cdef:
1694-
int error
1695-
Py_ssize_t i
1696-
size_t lines
1697-
coliter_t it
1698-
char *word
1699-
ndarray[uint8_t, cast=True] result
1700-
khiter_t k
1701-
1702-
lines = line_end - line_start
1703-
result = np.empty(lines, dtype=np.bool_)
1704-
1705-
coliter_setup(&it, parser, col, line_start)
1706-
for i in range(lines):
1707-
word = COLITER_NEXT(it)
1708-
1709-
k = kh_get_str(na_hashset, word)
1710-
# in the hash table
1711-
if k != na_hashset.n_buckets:
1712-
result[i] = 1
1713-
else:
1714-
result[i] = 0
1715-
1716-
return result
1717-
17181691
cdef kh_str_t* kset_from_list(list values) except NULL:
17191692
# caller takes responsibility for freeing the hash table
17201693
cdef:
@@ -1897,7 +1870,7 @@ cdef _apply_converter(object f, parser_t *parser, int col,
18971870
Py_ssize_t i
18981871
size_t lines
18991872
coliter_t it
1900-
char *word
1873+
const char * word = NULL
19011874
char *errors = "strict"
19021875
ndarray[object] result
19031876
object val
@@ -1909,17 +1882,17 @@ cdef _apply_converter(object f, parser_t *parser, int col,
19091882

19101883
if not PY3 and c_encoding == NULL:
19111884
for i in range(lines):
1912-
word = COLITER_NEXT(it)
1885+
COLITER_NEXT(it, word)
19131886
val = PyBytes_FromString(word)
19141887
result[i] = f(val)
19151888
elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'):
19161889
for i in range(lines):
1917-
word = COLITER_NEXT(it)
1890+
COLITER_NEXT(it, word)
19181891
val = PyUnicode_FromString(word)
19191892
result[i] = f(val)
19201893
else:
19211894
for i in range(lines):
1922-
word = COLITER_NEXT(it)
1895+
COLITER_NEXT(it, word)
19231896
val = PyUnicode_Decode(word, strlen(word),
19241897
c_encoding, errors)
19251898
result[i] = f(val)

0 commit comments

Comments
 (0)