1
+ # cython: profile=False
2
+ # cython: boundscheck=False, initializedcheck=False
3
+
1
4
import numpy as np
2
5
cimport numpy as np
3
6
from numpy cimport uint8_t, uint16_t, int8_t, int64_t
@@ -10,19 +13,19 @@ import sas_constants as const
10
13
cdef np.ndarray[uint8_t, ndim= 1 ] rle_decompress(int result_length, np.ndarray[uint8_t, ndim= 1 ] inbuff):
11
14
12
15
cdef:
13
- uint8_t control_byte, x, end_of_first_byte
16
+ uint8_t control_byte, x
14
17
uint8_t [:] result = np.zeros(result_length, np.uint8)
15
- int rpos = 0 , ipos = 0 , i, nbytes, length = len (inbuff)
18
+ int rpos = 0 , ipos = 0 , i, nbytes, end_of_first_byte, length = len (inbuff)
16
19
17
20
while ipos < length:
18
21
control_byte = inbuff[ipos] & 0xF0
19
- end_of_first_byte = int (inbuff[ipos] & 0x0F )
22
+ end_of_first_byte = < int > (inbuff[ipos] & 0x0F )
20
23
ipos += 1
21
24
22
25
if control_byte == 0x00 :
23
26
if end_of_first_byte != 0 :
24
- print (" Unexpected non-zero end_of_first_byte" )
25
- nbytes = int (inbuff[ipos]) + 64
27
+ raise ValueError (" Unexpected non-zero end_of_first_byte" )
28
+ nbytes = < int > (inbuff[ipos]) + 64
26
29
ipos += 1
27
30
for i in range (nbytes):
28
31
result[rpos] = inbuff[ipos]
@@ -31,20 +34,20 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
31
34
elif control_byte == 0x40 :
32
35
# not documented
33
36
nbytes = end_of_first_byte * 16
34
- nbytes += int (inbuff[ipos])
37
+ nbytes += < int > (inbuff[ipos])
35
38
ipos += 1
36
39
for i in range (nbytes):
37
40
result[rpos] = inbuff[ipos]
38
41
rpos += 1
39
42
ipos += 1
40
43
elif control_byte == 0x60 :
41
- nbytes = end_of_first_byte* 256 + int (inbuff[ipos]) + 17
44
+ nbytes = end_of_first_byte* 256 + < int > (inbuff[ipos]) + 17
42
45
ipos += 1
43
46
for i in range (nbytes):
44
47
result[rpos] = 0x20
45
48
rpos += 1
46
49
elif control_byte == 0x70 :
47
- nbytes = end_of_first_byte* 256 + int (inbuff[ipos]) + 17
50
+ nbytes = end_of_first_byte* 256 + < int > (inbuff[ipos]) + 17
48
51
ipos += 1
49
52
for i in range (nbytes):
50
53
result[rpos] = 0x00
@@ -99,7 +102,7 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
99
102
raise ValueError (" unknown control byte: %v " , control_byte)
100
103
101
104
if len (result) != result_length:
102
- print (" RLE: %v != %v \n " , (len (result), result_length))
105
+ raise ValueError (" RLE: %v != %v " , (len (result), result_length))
103
106
104
107
return np.asarray(result)
105
108
@@ -162,7 +165,7 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
162
165
ipos += 1
163
166
cnt += 16
164
167
for k in range (cnt):
165
- outbuff[rpos + k] = outbuff[rpos - int ( ofs) + k]
168
+ outbuff[rpos + k] = outbuff[rpos - < int > ofs + k]
166
169
rpos += cnt
167
170
168
171
# short pattern
@@ -171,7 +174,7 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
171
174
ofs += < uint16_t> inbuff[ipos] << 4
172
175
ipos += 1
173
176
for k in range (cmd):
174
- outbuff[rpos + k] = outbuff[rpos - int ( ofs) + k]
177
+ outbuff[rpos + k] = outbuff[rpos - < int > ofs + k]
175
178
rpos += cmd
176
179
177
180
else :
@@ -182,6 +185,17 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
182
185
183
186
return np.asarray(outbuff)
184
187
188
# Numeric tags stored in self.column_types for dispatch in the row loop.
# They correspond to the parser's ctype bytes: b'd' (decimal/double) -> 1,
# b's' (string) -> 2; any other ctype raises ValueError at map time.
cdef enum ColumnTypes:
    column_type_decimal = 1
    column_type_string = 2
191
+
192
+
193
# Module-level copies of the page-type constants from sas_constants,
# declared as C ints so the per-row page-type comparisons in readline()
# are plain C compares instead of Python attribute lookups.
# NOTE(review): assumes const.page_mix_types has exactly two entries —
# confirm against sas_constants.
cdef int page_meta_type = const.page_meta_type
cdef int page_mix_types_0 = const.page_mix_types[0]
cdef int page_mix_types_1 = const.page_mix_types[1]
cdef int page_data_type = const.page_data_type
cdef int subheader_pointers_offset = const.subheader_pointers_offset
185
199
186
200
cdef class Parser(object ):
187
201
@@ -194,11 +208,16 @@ cdef class Parser(object):
194
208
object [:, :] string_chunk
195
209
char * cached_page
196
210
int current_row_on_page_index
211
+ int current_page_block_count
212
+ int current_page_data_subheader_pointers_len
213
+ int current_page_subheaders_count
197
214
int current_row_in_chunk_index
198
215
int current_row_in_file_index
216
+ int header_length
199
217
int row_length
200
218
int bit_offset
201
219
int subheader_pointer_length
220
+ int current_page_type
202
221
bint is_little_endian
203
222
np.ndarray[uint8_t, ndim= 1 ] (* decompress)(int result_length, np.ndarray[uint8_t, ndim= 1 ] inbuff)
204
223
object parser
@@ -208,30 +227,30 @@ cdef class Parser(object):
208
227
int j
209
228
char [:] column_types
210
229
211
- self .current_row_on_page_index = parser._current_row_on_page_index
212
- self .current_row_in_chunk_index = parser._current_row_in_chunk_index
213
- self .current_row_in_file_index = parser._current_row_in_file_index
214
230
self .parser = parser
231
+ self .header_length = self .parser.header_length
215
232
self .column_count = parser.column_count
216
233
self .lengths = parser._column_data_lengths
217
234
self .offsets = parser._column_data_offsets
218
235
self .byte_chunk = parser._byte_chunk
219
236
self .string_chunk = parser._string_chunk
220
237
self .row_length = parser.row_length
221
- self .cached_page = < char * > parser._cached_page
222
238
self .bit_offset = self .parser._page_bit_offset
223
239
self .subheader_pointer_length = self .parser._subheader_pointer_length
224
240
self .is_little_endian = parser.byte_order == " <"
225
241
self .column_types = np.empty(self .column_count, dtype = ' int64' )
226
242
243
+ # page indicators
244
+ self .update_next_page()
245
+
227
246
column_types = parser.column_types
228
247
229
248
# map column types
230
249
for j in range (self .column_count):
231
250
if column_types[j] == b' d' :
232
- self .column_types[j] = 1
251
+ self .column_types[j] = column_type_decimal
233
252
elif column_types[j] == b' s' :
234
- self .column_types[j] = 2
253
+ self .column_types[j] = column_type_string
235
254
else :
236
255
raise ValueError (" unknown column type: %s " % self .parser.columns[j].ctype)
237
256
@@ -243,6 +262,11 @@ cdef class Parser(object):
243
262
else :
244
263
self .decompress = NULL
245
264
265
+ # update to current state of the parser
266
+ self .current_row_in_chunk_index = parser._current_row_in_chunk_index
267
+ self .current_row_in_file_index = parser._current_row_in_file_index
268
+ self .current_row_on_page_index = parser._current_row_on_page_index
269
+
246
270
def read (self , int nrows ):
247
271
cdef:
248
272
bint done
@@ -265,31 +289,39 @@ cdef class Parser(object):
265
289
if done:
266
290
self .cached_page = NULL
267
291
else :
268
- self .cached_page = < char * > self .parser._cached_page
269
- self .current_row_on_page_index = 0
292
+ self .update_next_page()
270
293
return done
271
294
295
    cdef update_next_page(self):
        # Refresh this object's cached per-page state after the underlying
        # parser has loaded a new page (called from __init__ and after each
        # successful read_next_page()).  Copying these values into typed C
        # attributes lets readline() test them without Python-level
        # attribute access on every row.
        self.cached_page = <char *> self.parser._cached_page
        # Row cursor restarts at the top of the new page.
        self.current_row_on_page_index = 0
        self.current_page_type = self.parser._current_page_type
        self.current_page_block_count = self.parser._current_page_block_count
        # Cache the length (not the list itself) — readline() only needs
        # the bound for its end-of-page check on meta pages.
        self.current_page_data_subheader_pointers_len = len(self.parser._current_page_data_subheader_pointers)
        self.current_page_subheaders_count = self.parser._current_page_subheaders_count
304
+
272
305
cdef bint readline(self ):
273
306
274
307
cdef:
275
- int offset, bit_offset, align_correction, subheader_pointer_length
308
+ int offset, bit_offset, align_correction, subheader_pointer_length, mn
276
309
bint done, flag
277
310
278
311
bit_offset = self .bit_offset
279
312
subheader_pointer_length = self .subheader_pointer_length
280
313
281
314
# If there is no page, go to the end of the header and read a page.
282
315
if self .cached_page == NULL :
283
- self .parser._path_or_buf.seek(self .parser. header_length)
316
+ self .parser._path_or_buf.seek(self .header_length)
284
317
done = self .read_next_page()
285
318
if done:
286
319
return True
287
320
288
321
# Loop until a data row is read
289
322
while True :
290
- if self .parser._current_page_type == const.page_meta_type:
291
- flag = (self .current_row_on_page_index >=
292
- len (self .parser._current_page_data_subheader_pointers))
323
+ if self .current_page_type == page_meta_type:
324
+ flag = self .current_row_on_page_index >= self .current_page_data_subheader_pointers_len
293
325
if flag:
294
326
done = self .read_next_page()
295
327
if done:
@@ -301,14 +333,14 @@ cdef class Parser(object):
301
333
self .process_byte_array_with_data(current_subheader_pointer.offset,
302
334
current_subheader_pointer.length)
303
335
return False
304
- elif self .parser._current_page_type in const.page_mix_types :
305
- align_correction = (bit_offset + const. subheader_pointers_offset +
306
- self .parser._current_page_subheaders_count *
336
+ elif self .current_page_type == page_mix_types_0 or self .current_page_type == page_mix_types_1 :
337
+ align_correction = (bit_offset + subheader_pointers_offset +
338
+ self .current_page_subheaders_count *
307
339
subheader_pointer_length)
308
340
align_correction = align_correction % 8
309
341
offset = bit_offset + align_correction
310
- offset += const. subheader_pointers_offset
311
- offset += (self .parser._current_page_subheaders_count *
342
+ offset += subheader_pointers_offset
343
+ offset += (self .current_page_subheaders_count *
312
344
subheader_pointer_length)
313
345
offset += self .current_row_on_page_index * self .row_length
314
346
self .process_byte_array_with_data(offset,
@@ -319,27 +351,29 @@ cdef class Parser(object):
319
351
if done:
320
352
return True
321
353
return False
322
- elif self .parser._current_page_type == const. page_data_type:
354
+ elif self .current_page_type == page_data_type:
323
355
self .process_byte_array_with_data(bit_offset +
324
- const. subheader_pointers_offset +
356
+ subheader_pointers_offset +
325
357
self .current_row_on_page_index *
326
358
self .row_length,
327
359
self .row_length)
328
360
flag = (self .current_row_on_page_index ==
329
- self .parser._current_page_block_count )
361
+ self .current_page_block_count )
330
362
if flag:
331
363
done = self .read_next_page()
332
364
if done:
333
365
return True
334
366
return False
335
367
else :
336
368
raise ValueError (" unknown page type: %s " ,
337
- self .parser._current_page_type )
369
+ self .current_page_type )
338
370
339
371
cdef void process_byte_array_with_data(self , int offset, int length):
340
372
341
373
cdef:
342
- int s, j, k, m, jb, js, lngt, start
374
+ Py_ssize_t j
375
+ int s, k, m, jb, js, current_row
376
+ int64_t lngt, start, ct
343
377
np.ndarray[uint8_t, ndim= 1 ] source
344
378
int64_t[:] column_types
345
379
int64_t[:] lengths
@@ -352,6 +386,7 @@ cdef class Parser(object):
352
386
if self .decompress != NULL and (length < self .row_length):
353
387
source = self .decompress(self .row_length, source)
354
388
389
+ current_row = self .current_row_in_chunk_index
355
390
column_types = self .column_types
356
391
lengths = self .lengths
357
392
offsets = self .offsets
@@ -365,7 +400,8 @@ cdef class Parser(object):
365
400
if lngt == 0 :
366
401
break
367
402
start = offsets[j]
368
- if column_types[j] == 1 :
403
+ ct = column_types[j]
404
+ if ct == column_type_decimal:
369
405
# decimal
370
406
if self .is_little_endian:
371
407
m = s + 8 - lngt
@@ -374,9 +410,9 @@ cdef class Parser(object):
374
410
for k in range (lngt):
375
411
byte_chunk[jb, m + k] = source[start + k]
376
412
jb += 1
377
- elif column_types[j] == 2 :
413
+ elif column_types[j] == column_type_string :
378
414
# string
379
- string_chunk[js, self .current_row_in_chunk_index ] = source[start:(start+ lngt)].tostring().rstrip()
415
+ string_chunk[js, current_row ] = source[start:(start+ lngt)].tostring().rstrip()
380
416
js += 1
381
417
382
418
self .current_row_on_page_index += 1
0 commit comments