@@ -3,6 +3,10 @@ import numpy as np
3
3
4
4
cnp.import_array()
5
5
6
+ from khash cimport *
7
+
8
+ from cpython cimport PyString_FromString, Py_INCREF, PyString_AsString
9
+
6
10
cdef extern from " Python.h" :
7
11
ctypedef struct FILE
8
12
FILE* PyFile_AsFile(object )
@@ -143,8 +147,10 @@ cdef class TextReader:
143
147
144
148
cdef public:
145
149
object delimiter, na_values, converters, thousands, delim_whitespace
150
+ object memory_map
146
151
147
- def __cinit__ (self , source , delimiter = ' ,' , header = 0 , memory_map = False ,
152
+ def __cinit__ (self , source , delimiter = ' ,' , header = 0 ,
153
+ memory_map = False ,
148
154
chunksize = DEFAULT_CHUNKSIZE,
149
155
delim_whitespace = False ,
150
156
na_values = None ,
@@ -173,6 +179,7 @@ cdef class TextReader:
173
179
self .delimiter = delimiter
174
180
self .delim_whitespace = delim_whitespace
175
181
182
+ self .memory_map = memory_map
176
183
self .na_values = na_values
177
184
self .converters = converters
178
185
self .thousands = thousands
@@ -185,16 +192,22 @@ cdef class TextReader:
185
192
self .file_handle.close()
186
193
187
194
cdef _setup_parser_source(self , source):
188
- cdef int status
195
+ cdef:
196
+ int status
189
197
190
198
if isinstance (source, (basestring , file )):
191
199
if isinstance (source, basestring ):
192
200
source = open (source, ' rb' )
193
201
self .should_close = True
194
202
195
203
self .file_handle = source
196
- status = parser_file_source_init(self .parser,
197
- PyFile_AsFile(source))
204
+
205
+ if self .memory_map:
206
+ status = parser_mmap_init(self .parser,
207
+ PyFile_AsFile(source))
208
+ else :
209
+ status = parser_file_source_init(self .parser,
210
+ PyFile_AsFile(source))
198
211
199
212
if status != 0 :
200
213
raise Exception (' Initializing from file failed' )
@@ -209,7 +222,8 @@ cdef class TextReader:
209
222
raise ValueError (' Only ascii/bytes supported at the moment' )
210
223
211
224
status = parser_array_source_init(self .parser,
212
- < char * > bytes, len (bytes))
225
+ PyString_AsString(bytes),
226
+ len (bytes))
213
227
if status != 0 :
214
228
raise Exception (' Initializing parser from file-like '
215
229
' object failed' )
@@ -248,6 +262,10 @@ cdef class TextReader:
248
262
for i in range (ncols):
249
263
col_res = _try_double(self .parser, i, 0 , self .parser.lines)
250
264
265
+ if col_res is None :
266
+ col_res = _string_box_factorize(self .parser, i,
267
+ 0 , self .parser.lines)
268
+
251
269
results[i] = col_res
252
270
253
271
return results
@@ -256,6 +274,54 @@ class CParserError(Exception):
256
274
pass
257
275
258
276
277
+ # ----------------------------------------------------------------------
278
+ # Type conversions / inference support code
279
+
280
cdef _string_box_factorize(parser_t *parser, int col,
                           int line_start, int line_end):
    """
    Box the words of one parsed column into an object ndarray, creating
    only one Python string per distinct word.

    Parameters
    ----------
    parser : parser_t*
        Parser whose tokenized data is walked with a column iterator.
    col : int
        Index of the column to convert.
    line_start, line_end : int
        Row bounds; the result holds ``line_end - line_start`` elements.

    Returns
    -------
    result : ndarray of object
        One entry per row. Rows with equal words reference the same
        Python string object.
    """
    cdef:
        Py_ssize_t i
        size_t lines
        coliter_t it
        char *word
        cnp.ndarray[object] result

        int ret = 0
        kh_strbox_t *table
        # Typed hash-table slot index; without a C type every lookup
        # result would be boxed into a Python integer.
        khiter_t k

        object pyval

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)

    # Maps each word already seen to the Python string created for it.
    table = kh_init_strbox()
    try:
        # NOTE(review): the iterator is set up without reference to
        # line_start, so iteration appears to begin at the first row
        # whatever line_start is. The visible caller passes 0 -- confirm
        # before calling with a non-zero start.
        coliter_setup(&it, parser, col)

        for i in range(lines):
            word = COLITER_NEXT(it)

            k = kh_get_strbox(table, word)

            if k != table.n_buckets:
                # Already boxed: reuse the existing string. The <object>
                # cast gives pyval its own reference.
                pyval = <object> table.vals[k]
            else:
                # First occurrence: box the word and remember it.
                pyval = PyString_FromString(word)

                k = kh_put_strbox(table, word, &ret)
                # The table only borrows this pointer; the owning
                # reference is the one stored into `result` below, which
                # outlives the table.
                table.vals[k] = <PyObject*> pyval

            result[i] = pyval
    finally:
        # Free the hash table on success and on error alike; it holds
        # borrowed pointers only, so no reference counts need adjusting.
        kh_destroy_strbox(table)

    return result
322
+
323
+
324
+
259
325
cdef _try_double(parser_t * parser, int col, int line_start, int line_end):
260
326
cdef:
261
327
int error
0 commit comments