1
- cimport numpy as cnp
1
+ from numpy cimport *
2
2
import numpy as np
3
3
4
- cnp. import_array()
4
+ import_array()
5
5
6
6
from khash cimport *
7
7
8
8
from cpython cimport PyString_FromString, Py_INCREF, PyString_AsString
9
9
10
+ cdef extern from " stdint.h" :
11
+ enum : INT64_MAX
12
+ enum : INT64_MIN
13
+ enum : INT32_MAX
14
+ enum : INT32_MIN
15
+
16
+
10
17
cdef extern from " Python.h" :
11
18
ctypedef struct FILE
12
19
FILE* PyFile_AsFile(object )
@@ -132,6 +139,12 @@ cdef extern from "parser/common.h":
132
139
int tokenize_all_rows(parser_t * self )
133
140
int tokenize_nrows(parser_t * self , size_t nrows)
134
141
142
+ int64_t str_to_int64(char * p_item, int64_t int_min,
143
+ int64_t int_max, int * error)
144
+ uint64_t str_to_uint64(char * p_item, uint64_t uint_max, int * error)
145
+
146
+
147
+
135
148
DEFAULT_CHUNKSIZE = 256 * 1024
136
149
137
150
cdef class TextReader:
@@ -144,6 +157,7 @@ cdef class TextReader:
144
157
cdef:
145
158
parser_t * parser
146
159
object file_handle, should_close
160
+ bint factorize
147
161
148
162
cdef public:
149
163
object delimiter, na_values, converters, thousands, delim_whitespace
@@ -155,7 +169,8 @@ cdef class TextReader:
155
169
delim_whitespace = False ,
156
170
na_values = None ,
157
171
converters = None ,
158
- thousands = None ):
172
+ thousands = None ,
173
+ factorize = True ):
159
174
self .parser = parser_new()
160
175
self .parser.chunksize = chunksize
161
176
@@ -171,6 +186,8 @@ cdef class TextReader:
171
186
raise ValueError (' only length-1 separators excluded right now' )
172
187
self .parser.delimiter = (< char * > delimiter)[0 ]
173
188
189
+ self .factorize = factorize
190
+
174
191
# TODO: no header vs. header is not the first row
175
192
self .parser.header = header
176
193
@@ -255,16 +272,27 @@ cdef class TextReader:
255
272
def _convert_column_data (self ):
256
273
cdef:
257
274
Py_ssize_t i, ncols
275
+ cast_func func
258
276
259
277
ncols = self .parser.line_fields[0 ]
260
278
261
279
results = {}
262
280
for i in range (ncols):
263
- col_res = _try_double(self .parser, i, 0 , self .parser.lines)
281
+ col_res = None
282
+ for func in cast_func_order:
283
+ col_res = func(self .parser, i, 0 , self .parser.lines)
284
+ if col_res is not None :
285
+ results[i] = col_res
286
+ break
264
287
265
288
if col_res is None :
266
- col_res = _string_box_factorize(self .parser, i,
267
- 0 , self .parser.lines)
289
+ raise Exception (' Unable to parse column %d ' % i)
290
+
291
+ # col_res = _try_double(self.parser, i, 0, self.parser.lines)
292
+
293
+ # if col_res is None:
294
+ # col_res = _string_box_factorize(self.parser, i,
295
+ # 0, self.parser.lines)
268
296
269
297
results[i] = col_res
270
298
@@ -277,6 +305,9 @@ class CParserError(Exception):
277
305
# ----------------------------------------------------------------------
278
306
# Type conversions / inference support code
279
307
308
# Function-pointer type shared by the column converters in this module
# (_try_int64, _try_double, _string_box_factorize): each takes the parser,
# a column index and a [line_start, line_end) range and returns a Python
# object. _try_int64 and _try_double return an ndarray, or None when a
# field cannot be converted.
ctypedef object (*cast_func)(parser_t *parser, int col,
                             int line_start, int line_end)
310
+
280
311
cdef _string_box_factorize(parser_t * parser, int col,
281
312
int line_start, int line_end):
282
313
cdef:
@@ -285,7 +316,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
285
316
size_t lines
286
317
coliter_t it
287
318
char * word
288
- cnp. ndarray[object ] result
319
+ ndarray[object ] result
289
320
290
321
int ret = 0
291
322
kh_strbox_t * table
@@ -329,7 +360,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end):
329
360
coliter_t it
330
361
char * word
331
362
double * data
332
- cnp. ndarray result
363
+ ndarray result
333
364
334
365
lines = line_end - line_start
335
366
@@ -349,6 +380,37 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end):
349
380
350
381
return result
351
382
383
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end):
    """
    Try to convert every field of column `col` to a 64-bit integer.

    Parameters
    ----------
    parser : parser_t*
        Tokenized parser state the column is read from.
    col : int
        Index of the column to convert.
    line_start, line_end : int
        Line range; ``line_end - line_start`` fields are converted.

    Returns
    -------
    ndarray of int64 with one entry per line, or None as soon as
    str_to_int64 flags a field as unconvertible (so the caller can try
    another converter).
    """
    cdef:
        # Start from "no error": the check in the loop must never read an
        # indeterminate value if str_to_int64 leaves the flag untouched on
        # success.
        int error = 0
        size_t i, lines
        coliter_t it
        char *word
        int64_t *data
        ndarray result

    lines = line_end - line_start

    result = np.empty(lines, dtype=np.int64)
    data = <int64_t *> result.data

    # NOTE(review): the iterator is positioned by coliter_setup without
    # reference to line_start; line_start only affects the field count.
    # Confirm callers always pass line_start == 0.
    coliter_setup(&it, parser, col)
    for i in range(lines):
        word = COLITER_NEXT(it)
        data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error)

        # Bail out on the first bad field; the partially filled array is
        # discarded.
        if error != 0:
            return None

    return result
407
+
408
+
409
# Converters in the order they are attempted for each column:
# TextReader._convert_column_data walks this table from index 0 and keeps
# the first result that is not None, so the narrowest type (int64) comes
# first, then double, then _string_box_factorize.
cdef cast_func cast_func_order[3]
cast_func_order[0] = _try_int64
cast_func_order[1] = _try_double
cast_func_order[2] = _string_box_factorize
413
+
352
414
cdef raise_parser_error(object base, parser_t * parser):
353
415
message = ' %s . C error: ' % base
354
416
if parser.error_msg != NULL :
0 commit comments