Skip to content

Commit e04d12a

Browse files
committed
Switch to use int64_t rather than size_t due to portability concerns.
1 parent d5c75e8 commit e04d12a

File tree

2 files changed

+85
-85
lines changed

2 files changed

+85
-85
lines changed

pandas/_libs/parsers.pyx

+67 -67
Original file line number | Diff line number | Diff line change
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
121121
io_callback cb_io
122122
io_cleanup cb_cleanup
123123

124-
size_t chunksize # Number of bytes to prepare for each chunk
125-
char *data # pointer to data to be processed
126-
size_t datalen # amount of data available
127-
size_t datapos
124+
int64_t chunksize # Number of bytes to prepare for each chunk
125+
char *data # pointer to data to be processed
126+
int64_t datalen # amount of data available
127+
int64_t datapos
128128

129129
# where to write out tokenized data
130130
char *stream
131-
size_t stream_len
132-
size_t stream_cap
131+
int64_t stream_len
132+
int64_t stream_cap
133133

134134
# Store words in (potentially ragged) matrix for now, hmm
135135
char **words
136-
size_t *word_starts # where we are in the stream
137-
size_t words_len
138-
size_t words_cap
136+
int64_t *word_starts # where we are in the stream
137+
int64_t words_len
138+
int64_t words_cap
139139

140-
char *pword_start # pointer to stream start of current field
141-
size_t word_start # position start of current field
140+
char *pword_start # pointer to stream start of current field
141+
int64_t word_start # position start of current field
142142

143-
size_t *line_start # position in words for start of line
144-
size_t *line_fields # Number of fields in each line
145-
size_t lines # Number of lines observed
146-
size_t file_lines # Number of file lines observed (with bad/skipped)
147-
size_t lines_cap # Vector capacity
143+
int64_t *line_start # position in words for start of line
144+
int64_t *line_fields # Number of fields in each line
145+
int64_t lines # Number of lines observed
146+
int64_t file_lines # Number of file lines observed (with bad/skipped)
147+
int64_t lines_cap # Vector capacity
148148

149149
# Tokenizing stuff
150150
ParserState state
@@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h":
178178
char thousands
179179

180180
int header # Boolean: 1: has header, 0: no header
181-
ssize_t header_start # header row start
182-
ssize_t header_end # header row end
181+
int64_t header_start # header row start
182+
int64_t header_end # header row end
183183

184184
void *skipset
185185
PyObject *skipfunc
186186
int64_t skip_first_N_rows
187-
size_t skipfooter
187+
int64_t skipfooter
188188
# pick one, depending on whether the converter requires GIL
189189
double (*double_converter_nogil)(const char *, char **,
190190
char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
195195
char *warn_msg
196196
char *error_msg
197197

198-
size_t skip_empty_lines
198+
int64_t skip_empty_lines
199199

200200
ctypedef struct coliter_t:
201201
char **words
202-
size_t *line_start
203-
size_t col
202+
int64_t *line_start
203+
int64_t col
204204

205205
ctypedef struct uint_state:
206206
int seen_sint
@@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h":
210210
void uint_state_init(uint_state *self)
211211
int uint64_conflict(uint_state *self)
212212

213-
void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil
213+
void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil
214214
void COLITER_NEXT(coliter_t, const char *) nogil
215215

216216
parser_t* parser_new()
@@ -289,14 +289,14 @@ cdef class TextReader:
289289
object true_values, false_values
290290
object handle
291291
bint na_filter, verbose, has_usecols, has_mi_columns
292-
size_t parser_start
292+
int64_t parser_start
293293
list clocks
294294
char *c_encoding
295295
kh_str_t *false_set
296296
kh_str_t *true_set
297297

298298
cdef public:
299-
size_t leading_cols, table_width, skipfooter, buffer_lines
299+
int64_t leading_cols, table_width, skipfooter, buffer_lines
300300
object allow_leading_cols
301301
object delimiter, converters, delim_whitespace
302302
object na_values
@@ -731,7 +731,7 @@ cdef class TextReader:
731731
char *word
732732
object name
733733
int status
734-
size_t hr, data_line
734+
int64_t hr, data_line
735735
char *errors = "strict"
736736
cdef StringPath path = _string_path(self.c_encoding)
737737

@@ -950,8 +950,8 @@ cdef class TextReader:
950950

951951
cdef _read_rows(self, rows, bint trim):
952952
cdef:
953-
size_t buffered_lines
954-
size_t irows, footer = 0
953+
int64_t buffered_lines
954+
int64_t irows, footer = 0
955955

956956
self._start_clock()
957957

@@ -1019,13 +1019,13 @@ cdef class TextReader:
10191019

10201020
def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
10211021
cdef:
1022-
size_t i
1022+
int64_t i
10231023
int nused
10241024
kh_str_t *na_hashset = NULL
1025-
size_t start, end
1025+
int64_t start, end
10261026
object name, na_flist, col_dtype = None
10271027
bint na_filter = 0
1028-
size_t num_cols
1028+
int64_t num_cols
10291029

10301030
start = self.parser_start
10311031

@@ -1038,7 +1038,7 @@ cdef class TextReader:
10381038
# if footer > 0:
10391039
# end -= footer
10401040

1041-
num_cols = 0
1041+
num_cols = -1
10421042
for i in range(self.parser.lines):
10431043
num_cols = (num_cols < self.parser.line_fields[i]) * \
10441044
self.parser.line_fields[i] + \
@@ -1197,7 +1197,7 @@ cdef class TextReader:
11971197
return col_res, na_count
11981198

11991199
cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
1200-
size_t start, size_t end,
1200+
int64_t start, int64_t end,
12011201
bint na_filter,
12021202
bint user_dtype,
12031203
kh_str_t *na_hashset,
@@ -1277,7 +1277,7 @@ cdef class TextReader:
12771277
raise TypeError("the dtype %s is not "
12781278
"supported for parsing" % dtype)
12791279

1280-
cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end,
1280+
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
12811281
bint na_filter, kh_str_t *na_hashset):
12821282

12831283
cdef StringPath path = _string_path(self.c_encoding)
@@ -1338,7 +1338,7 @@ cdef class TextReader:
13381338
kh_destroy_str(table)
13391339

13401340
cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
1341-
cdef int j
1341+
cdef int64_t j
13421342
if self.has_usecols and self.names is not None:
13431343
if (not callable(self.usecols) and
13441344
len(self.names) == len(self.usecols)):
@@ -1430,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding):
14301430
# ----------------------------------------------------------------------
14311431
# Type conversions / inference support code
14321432

1433-
cdef _string_box_factorize(parser_t *parser, size_t col,
1434-
size_t line_start, size_t line_end,
1433+
cdef _string_box_factorize(parser_t *parser, int64_t col,
1434+
int64_t line_start, int64_t line_end,
14351435
bint na_filter, kh_str_t *na_hashset):
14361436
cdef:
14371437
int error, na_count = 0
@@ -1483,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, size_t col,
14831483

14841484
return result, na_count
14851485

1486-
cdef _string_box_utf8(parser_t *parser, size_t col,
1487-
size_t line_start, size_t line_end,
1486+
cdef _string_box_utf8(parser_t *parser, int64_t col,
1487+
int64_t line_start, int64_t line_end,
14881488
bint na_filter, kh_str_t *na_hashset):
14891489
cdef:
14901490
int error, na_count = 0
@@ -1536,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, size_t col,
15361536

15371537
return result, na_count
15381538

1539-
cdef _string_box_decode(parser_t *parser, size_t col,
1540-
size_t line_start, size_t line_end,
1539+
cdef _string_box_decode(parser_t *parser, int64_t col,
1540+
int64_t line_start, int64_t line_end,
15411541
bint na_filter, kh_str_t *na_hashset,
15421542
char *encoding):
15431543
cdef:
@@ -1595,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, size_t col,
15951595

15961596

15971597
@cython.boundscheck(False)
1598-
cdef _categorical_convert(parser_t *parser, size_t col,
1599-
size_t line_start, size_t line_end,
1598+
cdef _categorical_convert(parser_t *parser, int64_t col,
1599+
int64_t line_start, int64_t line_end,
16001600
bint na_filter, kh_str_t *na_hashset,
16011601
char *encoding):
16021602
"Convert column data into codes, categories"
@@ -1666,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, size_t col,
16661666
kh_destroy_str(table)
16671667
return np.asarray(codes), result, na_count
16681668

1669-
cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start,
1670-
size_t line_end, size_t width):
1669+
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
1670+
int64_t line_end, int64_t width):
16711671
cdef:
16721672
Py_ssize_t i
16731673
coliter_t it
@@ -1683,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start,
16831683

16841684
return result
16851685

1686-
cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col,
1687-
size_t line_start, size_t line_end,
1686+
cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
1687+
int64_t line_start, int64_t line_end,
16881688
size_t width, char *data) nogil:
16891689
cdef:
1690-
size_t i
1690+
int64_t i
16911691
coliter_t it
16921692
const char *word = NULL
16931693

@@ -1702,7 +1702,7 @@ cdef char* cinf = b'inf'
17021702
cdef char* cposinf = b'+inf'
17031703
cdef char* cneginf = b'-inf'
17041704

1705-
cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end,
1705+
cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
17061706
bint na_filter, kh_str_t *na_hashset, object na_flist):
17071707
cdef:
17081708
int error, na_count = 0
@@ -1811,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser,
18111811

18121812
return 0
18131813

1814-
cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
1814+
cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
18151815
bint na_filter, kh_str_t *na_hashset):
18161816
cdef:
18171817
int error
@@ -1845,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_en
18451845

18461846
return result
18471847

1848-
cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start,
1849-
size_t line_end, bint na_filter,
1848+
cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start,
1849+
int64_t line_end, bint na_filter,
18501850
const kh_str_t *na_hashset,
18511851
uint64_t *data, uint_state *state) nogil:
18521852
cdef:
@@ -1882,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_star
18821882

18831883
return 0
18841884

1885-
cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end,
1885+
cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
18861886
bint na_filter, kh_str_t *na_hashset):
18871887
cdef:
18881888
int error, na_count = 0
@@ -1909,8 +1909,8 @@ cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end
19091909

19101910
return result, na_count
19111911

1912-
cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start,
1913-
size_t line_end, bint na_filter,
1912+
cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start,
1913+
int64_t line_end, bint na_filter,
19141914
const kh_str_t *na_hashset, int64_t NA,
19151915
int64_t *data, int *na_count) nogil:
19161916
cdef:
@@ -1947,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start
19471947

19481948
return 0
19491949

1950-
cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end,
1950+
cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
19511951
bint na_filter, kh_str_t *na_hashset):
19521952
cdef:
19531953
int na_count
@@ -1969,8 +1969,8 @@ cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end,
19691969
return None, None
19701970
return result.view(np.bool_), na_count
19711971

1972-
cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start,
1973-
size_t line_end, bint na_filter,
1972+
cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start,
1973+
int64_t line_end, bint na_filter,
19741974
const kh_str_t *na_hashset, uint8_t NA,
19751975
uint8_t *data, int *na_count) nogil:
19761976
cdef:
@@ -2009,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start,
20092009
data += 1
20102010
return 0
20112011

2012-
cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end,
2012+
cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end,
20132013
bint na_filter, const kh_str_t *na_hashset,
20142014
const kh_str_t *true_hashset,
20152015
const kh_str_t *false_hashset):
@@ -2035,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line
20352035
return None, None
20362036
return result.view(np.bool_), na_count
20372037

2038-
cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start,
2039-
size_t line_end, bint na_filter,
2038+
cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start,
2039+
int64_t line_end, bint na_filter,
20402040
const kh_str_t *na_hashset,
20412041
const kh_str_t *true_hashset,
20422042
const kh_str_t *false_hashset,
@@ -2254,8 +2254,8 @@ for k in list(na_values):
22542254
na_values[np.dtype(k)] = na_values[k]
22552255

22562256

2257-
cdef _apply_converter(object f, parser_t *parser, size_t col,
2258-
size_t line_start, size_t line_end,
2257+
cdef _apply_converter(object f, parser_t *parser, int64_t col,
2258+
int64_t line_start, int64_t line_end,
22592259
char* c_encoding):
22602260
cdef:
22612261
int error
@@ -2299,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols):
22992299

23002300
object name, fnames, field_type
23012301
Py_ssize_t i, offset, nfields, length
2302-
size_t stride, elsize
2302+
int64_t stride, elsize
23032303
char *buf
23042304

23052305
if names is None:
@@ -2347,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols):
23472347

23482348
return recs
23492349

2350-
cdef _fill_structured_column(char *dst, char* src, size_t elsize,
2351-
size_t stride, size_t length, bint incref):
2350+
cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
2351+
int64_t stride, int64_t length, bint incref):
23522352
cdef:
2353-
size_t i
2353+
int64_t i
23542354

23552355
if incref:
23562356
util.transfer_object_column(dst, src, stride, length)

0 commit comments

Comments
 (0)