BUG: Use size_t to avoid array index overflow; add missing malloc of error_msg #17040
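The overflow half of the title is the familiar 32-bit index problem: the tokenizer tracks stream positions, buffer lengths, and capacities as plain int, which tops out at 2**31 - 1 and can wrap once a large file pushes an offset past 2 GiB. A minimal C sketch of the failure mode (illustrative only, not code from this diff; assumes 32-bit int and 64-bit size_t):

    /* Illustrative only -- not code from this diff.  Assumes a platform
     * where int is 32 bits and size_t is 64 bits. */
    #include <stdio.h>
    #include <limits.h>

    int main(void) {
        /* A stream offset just past INT_MAX, e.g. while tokenizing a
         * CSV larger than 2 GiB. */
        size_t pos = (size_t)INT_MAX + 1;

        int as_int = (int)pos;    /* out of range for int: result is
                                     implementation-defined, typically
                                     wraps to a negative value */
        size_t as_size = pos;     /* represented exactly */

        printf("as int:    %d\n", as_int);
        printf("as size_t: %zu\n", as_size);
        return 0;
    }

The diff below mechanically widens those fields, and the Cython signatures that consume them, from int to size_t (or to ssize_t where a negative sentinel still has to fit).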
Changes from 1 commit
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
         io_callback cb_io
         io_cleanup cb_cleanup

-        int chunksize  # Number of bytes to prepare for each chunk
+        size_t chunksize  # Number of bytes to prepare for each chunk
         char *data  # pointer to data to be processed
-        int datalen  # amount of data available
-        int datapos
+        size_t datalen  # amount of data available
+        size_t datapos

         # where to write out tokenized data
         char *stream
-        int stream_len
-        int stream_cap
+        size_t stream_len
+        size_t stream_cap

         # Store words in (potentially ragged) matrix for now, hmm
         char **words
-        int *word_starts  # where we are in the stream
-        int words_len
-        int words_cap
+        size_t *word_starts  # where we are in the stream
+        size_t words_len
+        size_t words_cap

         char *pword_start  # pointer to stream start of current field
Review comment: Modify the spacing so that it's realigned with the hashtags below.
-        int word_start  # position start of current field
+        size_t word_start  # position start of current field

-        int *line_start  # position in words for start of line
-        int *line_fields  # Number of fields in each line
-        int lines  # Number of lines observed
-        int file_lines  # Number of file lines observed (with bad/skipped)
-        int lines_cap  # Vector capacity
+        size_t *line_start  # position in words for start of line
+        size_t *line_fields  # Number of fields in each line
+        size_t lines  # Number of lines observed
+        size_t file_lines  # Number of file lines observed (with bad/skipped)
+        size_t lines_cap  # Vector capacity

         # Tokenizing stuff
         ParserState state
@@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h":
         char thousands

         int header  # Boolean: 1: has header, 0: no header
-        int header_start  # header row start
-        int header_end  # header row end
+        ssize_t header_start  # header row start
+        ssize_t header_end  # header row end
Review comment: Modify the spacing so that it aligns with the hashtag above.
Review comment: Also, why ssize_t?
Reply: In some specific places, these fields are set to -1, so they need to stay signed.
         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
-        int skipfooter
+        size_t skipfooter
         # pick one, depending on whether the converter requires GIL
         double (*double_converter_nogil)(const char *, char **,
                                          char, char, char, int) nogil
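On the question above of why header_start and header_end become ssize_t rather than size_t: these fields can hold a negative "not set" value, and an unsigned type would silently turn that sentinel into a huge positive number. A small C sketch of the difference (illustrative only; reuses the field name from the struct above and assumes POSIX ssize_t, which plays the same role as Py_ssize_t on the Cython side):

    #include <stdio.h>
    #include <sys/types.h>   /* ssize_t */

    int main(void) {
        ssize_t header_start = -1;   /* sentinel: no header row */
        size_t lines_seen = 10;

        /* With a signed type the sentinel test is straightforward. */
        if (header_start < 0)
            printf("no header row recorded\n");

        /* Stored in a size_t, the same -1 becomes SIZE_MAX, and a mixed
         * signed/unsigned comparison promotes the signed side, so range
         * checks against line counts go wrong. */
        size_t wrong = (size_t)header_start;
        if (wrong > lines_seen)
            printf("-1 stored as size_t compares as %zu\n", wrong);
        return 0;
    }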
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
         char *warn_msg
         char *error_msg

-        int skip_empty_lines
+        size_t skip_empty_lines

     ctypedef struct coliter_t:
         char **words
-        int *line_start
-        int col
+        size_t *line_start
+        size_t col

     ctypedef struct uint_state:
         int seen_sint
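The other half of the PR title, the missing malloc of error_msg, is not part of this particular commit, but the warn_msg/error_msg pointers above are where it matters: the buffer has to be allocated before an error string is formatted into it. A rough C sketch of that pattern (toy_parser_t, ERROR_MSG_CAP, and report_error are hypothetical names for illustration, not the actual pandas fix):

    #include <stdio.h>
    #include <stdlib.h>

    enum { ERROR_MSG_CAP = 256 };       /* hypothetical capacity */

    typedef struct {
        char *error_msg;                /* mirrors parser_t.error_msg above */
    } toy_parser_t;

    static int report_error(toy_parser_t *p, size_t bad_line) {
        if (p->error_msg == NULL) {     /* the allocation that must not be skipped */
            p->error_msg = malloc(ERROR_MSG_CAP);
            if (p->error_msg == NULL)
                return -1;              /* out of memory */
        }
        snprintf(p->error_msg, ERROR_MSG_CAP,
                 "Error tokenizing data near line %zu", bad_line);
        return 0;
    }

    int main(void) {
        toy_parser_t p = { NULL };
        if (report_error(&p, 12345) == 0)
            puts(p.error_msg);
        free(p.error_msg);
        return 0;
    }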
@@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h":
     void uint_state_init(uint_state *self)
     int uint64_conflict(uint_state *self)

-    void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
+    void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil
     void COLITER_NEXT(coliter_t, const char *) nogil

     parser_t* parser_new()
@@ -289,14 +289,14 @@ cdef class TextReader:
         object true_values, false_values
         object handle
         bint na_filter, verbose, has_usecols, has_mi_columns
-        int parser_start
+        size_t parser_start
         list clocks
         char *c_encoding
         kh_str_t *false_set
         kh_str_t *true_set

     cdef public:
-        int leading_cols, table_width, skipfooter, buffer_lines
+        size_t leading_cols, table_width, skipfooter, buffer_lines
         object allow_leading_cols
         object delimiter, converters, delim_whitespace
         object na_values
@@ -730,7 +730,8 @@ cdef class TextReader:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count  # noqa
             char *word
             object name
-            int status, hr, data_line
+            int status
+            size_t hr, data_line
             char *errors = "strict"
         cdef StringPath path = _string_path(self.c_encoding)
@@ -949,8 +950,8 @@ cdef class TextReader:

     cdef _read_rows(self, rows, bint trim):
         cdef:
-            int buffered_lines
-            int irows, footer = 0
+            size_t buffered_lines
+            size_t irows, footer = 0

         self._start_clock()
@@ -1018,12 +1019,13 @@ cdef class TextReader:

     def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
         cdef:
-            Py_ssize_t i, nused
+            size_t i
+            int nused
             kh_str_t *na_hashset = NULL
-            int start, end
+            size_t start, end
             object name, na_flist, col_dtype = None
             bint na_filter = 0
-            Py_ssize_t num_cols
+            size_t num_cols

         start = self.parser_start
@@ -1036,7 +1038,7 @@ cdef class TextReader:
         # if footer > 0:
         # end -= footer

-        num_cols = -1
+        num_cols = 0
         for i in range(self.parser.lines):
             num_cols = (num_cols < self.parser.line_fields[i]) * \
                 self.parser.line_fields[i] + \
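The changed initialization goes hand in hand with the new type: the loop above computes a branchless running maximum of line_fields, and with num_cols now an unsigned size_t a starting value of -1 would wrap to SIZE_MAX, so the (num_cols < line_fields[i]) term could never fire. The same arithmetic in plain C, with made-up data (illustrative sketch only):

    #include <stdio.h>

    int main(void) {
        size_t line_fields[] = { 3, 7, 5 };   /* fields seen on each line */
        size_t num_cols = 0;                  /* not (size_t)-1 */

        for (size_t i = 0; i < 3; i++) {
            /* num_cols = max(num_cols, line_fields[i]), without a branch */
            num_cols = (num_cols < line_fields[i]) * line_fields[i]
                     + (num_cols >= line_fields[i]) * num_cols;
        }
        printf("widest line has %zu fields\n", num_cols);   /* prints 7 */
        return 0;
    }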
@@ -1195,7 +1197,7 @@ cdef class TextReader:
         return col_res, na_count

     cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
-                             int start, int end,
+                             size_t start, size_t end,
                              bint na_filter,
                              bint user_dtype,
                              kh_str_t *na_hashset,
@@ -1275,7 +1277,7 @@ cdef class TextReader:
             raise TypeError("the dtype %s is not "
                             "supported for parsing" % dtype)

-    cdef _string_convert(self, Py_ssize_t i, int start, int end,
+    cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end,
                          bint na_filter, kh_str_t *na_hashset):

         cdef StringPath path = _string_path(self.c_encoding)
@@ -1336,6 +1338,7 @@ cdef class TextReader:
         kh_destroy_str(table)

     cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
+        cdef int j
         if self.has_usecols and self.names is not None:
             if (not callable(self.usecols) and
                     len(self.names) == len(self.usecols)):
@@ -1427,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding):
 # ----------------------------------------------------------------------
 # Type conversions / inference support code

-cdef _string_box_factorize(parser_t *parser, int col,
-                           int line_start, int line_end,
+cdef _string_box_factorize(parser_t *parser, size_t col,
+                           size_t line_start, size_t line_end,
                            bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1480,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, int col,

     return result, na_count

-cdef _string_box_utf8(parser_t *parser, int col,
-                      int line_start, int line_end,
+cdef _string_box_utf8(parser_t *parser, size_t col,
+                      size_t line_start, size_t line_end,
                       bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1533,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, int col,

     return result, na_count

-cdef _string_box_decode(parser_t *parser, int col,
-                        int line_start, int line_end,
+cdef _string_box_decode(parser_t *parser, size_t col,
+                        size_t line_start, size_t line_end,
                         bint na_filter, kh_str_t *na_hashset,
                         char *encoding):
     cdef:
@@ -1592,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, int col,


 @cython.boundscheck(False)
-cdef _categorical_convert(parser_t *parser, int col,
-                          int line_start, int line_end,
+cdef _categorical_convert(parser_t *parser, size_t col,
+                          size_t line_start, size_t line_end,
                           bint na_filter, kh_str_t *na_hashset,
                           char *encoding):
     "Convert column data into codes, categories"
@@ -1663,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, int col,
     kh_destroy_str(table)
     return np.asarray(codes), result, na_count

-cdef _to_fw_string(parser_t *parser, int col, int line_start,
-                   int line_end, size_t width):
+cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start,
+                   size_t line_end, size_t width):
     cdef:
         Py_ssize_t i
         coliter_t it
@@ -1680,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,

     return result

-cdef inline void _to_fw_string_nogil(parser_t *parser, int col,
-                                     int line_start, int line_end,
+cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col,
+                                     size_t line_start, size_t line_end,
                                      size_t width, char *data) nogil:
     cdef:
-        Py_ssize_t i
+        size_t i
         coliter_t it
         const char *word = NULL
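The local counter in _to_fw_string_nogil switches from Py_ssize_t to size_t for the same reason as the signature: once the loop bounds are size_t, a signed counter forces an implicit signed-to-unsigned conversion in every comparison and draws -Wsign-compare warnings. A short C sketch of the two variants (illustrative only):

    #include <stdio.h>

    int main(void) {
        size_t line_end = 3;

        /* Mixed types: under -Wsign-compare the compiler warns here, and
         * the signed counter is converted to unsigned for each test. */
        for (long i = 0; i < line_end; i++)
            printf("signed counter: %ld\n", i);

        /* Counter matches the bound's type: no conversion, no warning. */
        for (size_t i = 0; i < line_end; i++)
            printf("size_t counter: %zu\n", i);

        return 0;
    }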
@@ -1699,7 +1702,7 @@ cdef char* cinf = b'inf'
 cdef char* cposinf = b'+inf'
 cdef char* cneginf = b'-inf'

-cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end,
                  bint na_filter, kh_str_t *na_hashset, object na_flist):
     cdef:
         int error, na_count = 0
@@ -1808,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser,

     return 0

-cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
                  bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error
@@ -1842,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,

     return result

-cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
-                                  int line_end, bint na_filter,
+cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start,
+                                  size_t line_end, bint na_filter,
                                   const kh_str_t *na_hashset,
                                   uint64_t *data, uint_state *state) nogil:
     cdef:
@@ -1879,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,

     return 0

-cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1906,8 +1909,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

     return result, na_count

-cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
-                                 int line_end, bint na_filter,
+cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start,
+                                 size_t line_end, bint na_filter,
                                  const kh_str_t *na_hashset, int64_t NA,
                                  int64_t *data, int *na_count) nogil:
     cdef:
@@ -1944,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,

     return 0

-cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end,
                bint na_filter, kh_str_t *na_hashset):
     cdef:
         int na_count
@@ -1966,8 +1969,8 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
         return None, None
     return result.view(np.bool_), na_count

-cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
-                                int line_end, bint na_filter,
+cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start,
+                                size_t line_end, bint na_filter,
                                 const kh_str_t *na_hashset, uint8_t NA,
                                 uint8_t *data, int *na_count) nogil:
     cdef:
@@ -2006,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
         data += 1
     return 0

-cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end,
                     bint na_filter, const kh_str_t *na_hashset,
                     const kh_str_t *true_hashset,
                     const kh_str_t *false_hashset):
@@ -2032,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
         return None, None
     return result.view(np.bool_), na_count

-cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
-                                     int line_end, bint na_filter,
+cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start,
+                                     size_t line_end, bint na_filter,
                                      const kh_str_t *na_hashset,
                                      const kh_str_t *true_hashset,
                                      const kh_str_t *false_hashset,
@@ -2251,8 +2254,8 @@ for k in list(na_values):
     na_values[np.dtype(k)] = na_values[k]


-cdef _apply_converter(object f, parser_t *parser, int col,
-                      int line_start, int line_end,
+cdef _apply_converter(object f, parser_t *parser, size_t col,
+                      size_t line_start, size_t line_end,
                       char* c_encoding):
     cdef:
         int error
@@ -2296,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols):

         object name, fnames, field_type
         Py_ssize_t i, offset, nfields, length
-        int stride, elsize
+        size_t stride, elsize
         char *buf

     if names is None:
@@ -2344,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols):

     return recs

-cdef _fill_structured_column(char *dst, char* src, int elsize,
-                             int stride, int length, bint incref):
+cdef _fill_structured_column(char *dst, char* src, size_t elsize,
+                             size_t stride, size_t length, bint incref):
     cdef:
-        Py_ssize_t i
+        size_t i

     if incref:
         util.transfer_object_column(dst, src, stride, length)
Review comment: Fix the spacing so that the hashtags line up like before.
Reply: If I could do that and keep it under 80 columns, I would have.
Review comment: Why can't you move the "pointer to data to be processed" one further right?
Reply: Oops! Sorry, I was looking at the wrong file! Will fix in next push.