Skip to content

Commit d5c75e8

Browse files
committed
BUG: Use size_t to avoid array index overflow; add missing malloc of error_msg
Fix a few locations where a parser's `error_msg` buffer is written to without having been previously allocated. This manifested as a double free in exception-handling code that makes use of `error_msg`. Additionally, use `size_t`/`ssize_t` wherever array indices or lengths are stored. Previously, int32_t was used and would overflow on columns with very large amounts of data (i.e. greater than INT_MAX bytes).
1 parent 031d7a9 commit d5c75e8

File tree

3 files changed

+166
-139
lines changed

3 files changed

+166
-139
lines changed

pandas/_libs/parsers.pyx

+67-64
Original file line numberDiff line numberDiff line change
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
121121
io_callback cb_io
122122
io_cleanup cb_cleanup
123123

124-
int chunksize # Number of bytes to prepare for each chunk
124+
size_t chunksize # Number of bytes to prepare for each chunk
125125
char *data # pointer to data to be processed
126-
int datalen # amount of data available
127-
int datapos
126+
size_t datalen # amount of data available
127+
size_t datapos
128128

129129
# where to write out tokenized data
130130
char *stream
131-
int stream_len
132-
int stream_cap
131+
size_t stream_len
132+
size_t stream_cap
133133

134134
# Store words in (potentially ragged) matrix for now, hmm
135135
char **words
136-
int *word_starts # where we are in the stream
137-
int words_len
138-
int words_cap
136+
size_t *word_starts # where we are in the stream
137+
size_t words_len
138+
size_t words_cap
139139

140140
char *pword_start # pointer to stream start of current field
141-
int word_start # position start of current field
141+
size_t word_start # position start of current field
142142

143-
int *line_start # position in words for start of line
144-
int *line_fields # Number of fields in each line
145-
int lines # Number of lines observed
146-
int file_lines # Number of file lines observed (with bad/skipped)
147-
int lines_cap # Vector capacity
143+
size_t *line_start # position in words for start of line
144+
size_t *line_fields # Number of fields in each line
145+
size_t lines # Number of lines observed
146+
size_t file_lines # Number of file lines observed (with bad/skipped)
147+
size_t lines_cap # Vector capacity
148148

149149
# Tokenizing stuff
150150
ParserState state
@@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h":
178178
char thousands
179179

180180
int header # Boolean: 1: has header, 0: no header
181-
int header_start # header row start
182-
int header_end # header row end
181+
ssize_t header_start # header row start
182+
ssize_t header_end # header row end
183183

184184
void *skipset
185185
PyObject *skipfunc
186186
int64_t skip_first_N_rows
187-
int skipfooter
187+
size_t skipfooter
188188
# pick one, depending on whether the converter requires GIL
189189
double (*double_converter_nogil)(const char *, char **,
190190
char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
195195
char *warn_msg
196196
char *error_msg
197197

198-
int skip_empty_lines
198+
size_t skip_empty_lines
199199

200200
ctypedef struct coliter_t:
201201
char **words
202-
int *line_start
203-
int col
202+
size_t *line_start
203+
size_t col
204204

205205
ctypedef struct uint_state:
206206
int seen_sint
@@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h":
210210
void uint_state_init(uint_state *self)
211211
int uint64_conflict(uint_state *self)
212212

213-
void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
213+
void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil
214214
void COLITER_NEXT(coliter_t, const char *) nogil
215215

216216
parser_t* parser_new()
@@ -289,14 +289,14 @@ cdef class TextReader:
289289
object true_values, false_values
290290
object handle
291291
bint na_filter, verbose, has_usecols, has_mi_columns
292-
int parser_start
292+
size_t parser_start
293293
list clocks
294294
char *c_encoding
295295
kh_str_t *false_set
296296
kh_str_t *true_set
297297

298298
cdef public:
299-
int leading_cols, table_width, skipfooter, buffer_lines
299+
size_t leading_cols, table_width, skipfooter, buffer_lines
300300
object allow_leading_cols
301301
object delimiter, converters, delim_whitespace
302302
object na_values
@@ -730,7 +730,8 @@ cdef class TextReader:
730730
Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa
731731
char *word
732732
object name
733-
int status, hr, data_line
733+
int status
734+
size_t hr, data_line
734735
char *errors = "strict"
735736
cdef StringPath path = _string_path(self.c_encoding)
736737

@@ -949,8 +950,8 @@ cdef class TextReader:
949950

950951
cdef _read_rows(self, rows, bint trim):
951952
cdef:
952-
int buffered_lines
953-
int irows, footer = 0
953+
size_t buffered_lines
954+
size_t irows, footer = 0
954955

955956
self._start_clock()
956957

@@ -1018,12 +1019,13 @@ cdef class TextReader:
10181019

10191020
def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
10201021
cdef:
1021-
Py_ssize_t i, nused
1022+
size_t i
1023+
int nused
10221024
kh_str_t *na_hashset = NULL
1023-
int start, end
1025+
size_t start, end
10241026
object name, na_flist, col_dtype = None
10251027
bint na_filter = 0
1026-
Py_ssize_t num_cols
1028+
size_t num_cols
10271029

10281030
start = self.parser_start
10291031

@@ -1036,7 +1038,7 @@ cdef class TextReader:
10361038
# if footer > 0:
10371039
# end -= footer
10381040

1039-
num_cols = -1
1041+
num_cols = 0
10401042
for i in range(self.parser.lines):
10411043
num_cols = (num_cols < self.parser.line_fields[i]) * \
10421044
self.parser.line_fields[i] + \
@@ -1195,7 +1197,7 @@ cdef class TextReader:
11951197
return col_res, na_count
11961198

11971199
cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
1198-
int start, int end,
1200+
size_t start, size_t end,
11991201
bint na_filter,
12001202
bint user_dtype,
12011203
kh_str_t *na_hashset,
@@ -1275,7 +1277,7 @@ cdef class TextReader:
12751277
raise TypeError("the dtype %s is not "
12761278
"supported for parsing" % dtype)
12771279

1278-
cdef _string_convert(self, Py_ssize_t i, int start, int end,
1280+
cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end,
12791281
bint na_filter, kh_str_t *na_hashset):
12801282

12811283
cdef StringPath path = _string_path(self.c_encoding)
@@ -1336,6 +1338,7 @@ cdef class TextReader:
13361338
kh_destroy_str(table)
13371339

13381340
cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
1341+
cdef int j
13391342
if self.has_usecols and self.names is not None:
13401343
if (not callable(self.usecols) and
13411344
len(self.names) == len(self.usecols)):
@@ -1427,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding):
14271430
# ----------------------------------------------------------------------
14281431
# Type conversions / inference support code
14291432

1430-
cdef _string_box_factorize(parser_t *parser, int col,
1431-
int line_start, int line_end,
1433+
cdef _string_box_factorize(parser_t *parser, size_t col,
1434+
size_t line_start, size_t line_end,
14321435
bint na_filter, kh_str_t *na_hashset):
14331436
cdef:
14341437
int error, na_count = 0
@@ -1480,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, int col,
14801483

14811484
return result, na_count
14821485

1483-
cdef _string_box_utf8(parser_t *parser, int col,
1484-
int line_start, int line_end,
1486+
cdef _string_box_utf8(parser_t *parser, size_t col,
1487+
size_t line_start, size_t line_end,
14851488
bint na_filter, kh_str_t *na_hashset):
14861489
cdef:
14871490
int error, na_count = 0
@@ -1533,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, int col,
15331536

15341537
return result, na_count
15351538

1536-
cdef _string_box_decode(parser_t *parser, int col,
1537-
int line_start, int line_end,
1539+
cdef _string_box_decode(parser_t *parser, size_t col,
1540+
size_t line_start, size_t line_end,
15381541
bint na_filter, kh_str_t *na_hashset,
15391542
char *encoding):
15401543
cdef:
@@ -1592,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, int col,
15921595

15931596

15941597
@cython.boundscheck(False)
1595-
cdef _categorical_convert(parser_t *parser, int col,
1596-
int line_start, int line_end,
1598+
cdef _categorical_convert(parser_t *parser, size_t col,
1599+
size_t line_start, size_t line_end,
15971600
bint na_filter, kh_str_t *na_hashset,
15981601
char *encoding):
15991602
"Convert column data into codes, categories"
@@ -1663,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, int col,
16631666
kh_destroy_str(table)
16641667
return np.asarray(codes), result, na_count
16651668

1666-
cdef _to_fw_string(parser_t *parser, int col, int line_start,
1667-
int line_end, size_t width):
1669+
cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start,
1670+
size_t line_end, size_t width):
16681671
cdef:
16691672
Py_ssize_t i
16701673
coliter_t it
@@ -1680,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
16801683

16811684
return result
16821685

1683-
cdef inline void _to_fw_string_nogil(parser_t *parser, int col,
1684-
int line_start, int line_end,
1686+
cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col,
1687+
size_t line_start, size_t line_end,
16851688
size_t width, char *data) nogil:
16861689
cdef:
1687-
Py_ssize_t i
1690+
size_t i
16881691
coliter_t it
16891692
const char *word = NULL
16901693

@@ -1699,7 +1702,7 @@ cdef char* cinf = b'inf'
16991702
cdef char* cposinf = b'+inf'
17001703
cdef char* cneginf = b'-inf'
17011704

1702-
cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
1705+
cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end,
17031706
bint na_filter, kh_str_t *na_hashset, object na_flist):
17041707
cdef:
17051708
int error, na_count = 0
@@ -1808,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser,
18081811

18091812
return 0
18101813

1811-
cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
1814+
cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
18121815
bint na_filter, kh_str_t *na_hashset):
18131816
cdef:
18141817
int error
@@ -1842,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
18421845

18431846
return result
18441847

1845-
cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
1846-
int line_end, bint na_filter,
1848+
cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start,
1849+
size_t line_end, bint na_filter,
18471850
const kh_str_t *na_hashset,
18481851
uint64_t *data, uint_state *state) nogil:
18491852
cdef:
@@ -1879,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
18791882

18801883
return 0
18811884

1882-
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
1885+
cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
18831886
bint na_filter, kh_str_t *na_hashset):
18841887
cdef:
18851888
int error, na_count = 0
@@ -1906,8 +1909,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
19061909

19071910
return result, na_count
19081911

1909-
cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
1910-
int line_end, bint na_filter,
1912+
cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start,
1913+
size_t line_end, bint na_filter,
19111914
const kh_str_t *na_hashset, int64_t NA,
19121915
int64_t *data, int *na_count) nogil:
19131916
cdef:
@@ -1944,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
19441947

19451948
return 0
19461949

1947-
cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
1950+
cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end,
19481951
bint na_filter, kh_str_t *na_hashset):
19491952
cdef:
19501953
int na_count
@@ -1966,8 +1969,8 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
19661969
return None, None
19671970
return result.view(np.bool_), na_count
19681971

1969-
cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
1970-
int line_end, bint na_filter,
1972+
cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start,
1973+
size_t line_end, bint na_filter,
19711974
const kh_str_t *na_hashset, uint8_t NA,
19721975
uint8_t *data, int *na_count) nogil:
19731976
cdef:
@@ -2006,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
20062009
data += 1
20072010
return 0
20082011

2009-
cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
2012+
cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end,
20102013
bint na_filter, const kh_str_t *na_hashset,
20112014
const kh_str_t *true_hashset,
20122015
const kh_str_t *false_hashset):
@@ -2032,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
20322035
return None, None
20332036
return result.view(np.bool_), na_count
20342037

2035-
cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
2036-
int line_end, bint na_filter,
2038+
cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start,
2039+
size_t line_end, bint na_filter,
20372040
const kh_str_t *na_hashset,
20382041
const kh_str_t *true_hashset,
20392042
const kh_str_t *false_hashset,
@@ -2251,8 +2254,8 @@ for k in list(na_values):
22512254
na_values[np.dtype(k)] = na_values[k]
22522255

22532256

2254-
cdef _apply_converter(object f, parser_t *parser, int col,
2255-
int line_start, int line_end,
2257+
cdef _apply_converter(object f, parser_t *parser, size_t col,
2258+
size_t line_start, size_t line_end,
22562259
char* c_encoding):
22572260
cdef:
22582261
int error
@@ -2296,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols):
22962299

22972300
object name, fnames, field_type
22982301
Py_ssize_t i, offset, nfields, length
2299-
int stride, elsize
2302+
size_t stride, elsize
23002303
char *buf
23012304

23022305
if names is None:
@@ -2344,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols):
23442347

23452348
return recs
23462349

2347-
cdef _fill_structured_column(char *dst, char* src, int elsize,
2348-
int stride, int length, bint incref):
2350+
cdef _fill_structured_column(char *dst, char* src, size_t elsize,
2351+
size_t stride, size_t length, bint incref):
23492352
cdef:
2350-
Py_ssize_t i
2353+
size_t i
23512354

23522355
if incref:
23532356
util.transfer_object_column(dst, src, stride, length)

0 commit comments

Comments
 (0)