@@ -8,17 +8,15 @@ import sas_constants as const
8
8
# algorithm. It is partially documented here:
9
9
#
10
10
# https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
11
- def _rle_decompress (int result_length , np.ndarray[uint8_t , ndim = 1 ] inbuff):
11
+ cdef np.ndarray[uint8_t, ndim = 1 ] rle_decompress (int result_length, np.ndarray[uint8_t, ndim= 1 ] inbuff):
12
12
13
- cdef uint8_t control_byte
14
- cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
15
-
16
- cdef int rpos = 0
17
- cdef int ipos = 0
18
- cdef int i
19
- cdef int nbytes
20
- cdef uint8_t x
21
- cdef length = len (inbuff)
13
+ cdef:
14
+ uint8_t control_byte, x
15
+ np.ndarray[uint8_t, ndim= 1 ] result = np.zeros(result_length, np.uint8)
16
+ int rpos = 0
17
+ int ipos = 0
18
+ int i, nbytes
19
+ length = len (inbuff)
22
20
23
21
while ipos < length:
24
22
control_byte = inbuff[ipos] & 0xF0
@@ -107,24 +105,22 @@ def _rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
107
105
if len (result) != result_length:
108
106
print (" RLE: %v != %v \n " , (len (result), result_length))
109
107
110
- return np.asarray(result).tostring( )
108
+ return np.asarray(result, dtype = np.uint8 )
111
109
112
110
113
111
# rdc_decompress decompresses data using the Ross Data Compression algorithm:
114
112
#
115
113
# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
116
- def _rdc_decompress (int result_length , np.ndarray[uint8_t , ndim = 1 ] inbuff):
117
-
118
- cdef uint8_t cmd
119
- cdef uint16_t ctrl_bits
120
- cdef uint16_t ctrl_mask = 0
121
- cdef uint16_t ofs
122
- cdef uint16_t cnt
123
- cdef int ipos = 0
124
- cdef int rpos = 0
125
- cdef int k
114
+ cdef np.ndarray[uint8_t, ndim= 1 ] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim= 1 ] inbuff):
126
115
127
- cdef uint8_t [:] outbuff = np.zeros(result_length, dtype = np.uint8)
116
+ cdef:
117
+ uint8_t cmd, ofs, cnt
118
+ uint16_t ctrl_bits
119
+ uint16_t ctrl_mask = 0
120
+ int ipos = 0
121
+ int rpos = 0
122
+ int k
123
+ np.ndarray[uint8_t, ndim= 1 ] outbuff = np.zeros(result_length, dtype = np.uint8)
128
124
129
125
ii = - 1
130
126
@@ -191,24 +187,33 @@ def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
191
187
if len (outbuff) != result_length:
192
188
raise ValueError (" RDC: %v != %v \n " , len (outbuff), result_length)
193
189
194
- return np.asarray(outbuff).tostring( )
190
+ return np.asarray(outbuff, dtype = np.uint8 )
195
191
196
192
197
- def _do_read (parser , int nrows ):
193
cdef np.ndarray[uint8_t, ndim=1] decompress(object parser, int row_length, uint8_t[:] page):
    """Decompress one page of SAS data into a uint8 array of length row_length.

    Dispatches on ``parser.compression``: RLE pages go to ``rle_decompress``,
    RDC pages to ``rdc_decompress``.  Any other value is an error.

    Raises
    ------
    ValueError
        If ``parser.compression`` names an unknown compression method.
    """
    # Re-wrap the incoming memoryview as a contiguous uint8 ndarray so it
    # matches the typed signatures of the decompressors below.
    page = np.frombuffer(page, dtype=np.uint8)
    method = parser.compression
    if method == const.rle_compression:
        return rle_decompress(row_length, page)
    if method == const.rdc_compression:
        return rdc_decompress(row_length, page)
    raise ValueError("unknown SAS compression method: %s" %
                     method)
202
+
203
+
204
def do_read(object parser, int nrows):
    """Read up to ``nrows`` rows through ``parser``.

    Invokes ``readline`` once per row and stops early as soon as it
    reports that no more rows are available.
    """
    cdef int row

    for row in range(nrows):
        if readline(parser):
            # End of data reached before nrows rows were read.
            break
204
211
205
212
206
- def _readline ( parser ):
213
+ cdef bint readline( object parser):
207
214
208
- cdef int offset
209
- cdef int bit_offset
210
- cdef int align_correction
211
- cdef int subheader_pointer_length
215
+ cdef:
216
+ int offset, bit_offset, align_correction, subheader_pointer_length
212
217
213
218
bit_offset = parser._page_bit_offset
214
219
subheader_pointer_length = parser._subheader_pointer_length
@@ -236,9 +241,7 @@ def _readline(parser):
236
241
parser._current_row_on_page_index])
237
242
process_byte_array_with_data(parser,
238
243
current_subheader_pointer.offset,
239
- current_subheader_pointer.length,
240
- parser._byte_chunk,
241
- parser._string_chunk)
244
+ current_subheader_pointer.length)
242
245
return False
243
246
elif parser._current_page_type in const.page_mix_types:
244
247
align_correction = (bit_offset + const.subheader_pointers_offset +
@@ -250,9 +253,7 @@ def _readline(parser):
250
253
offset += (parser._current_page_subheaders_count *
251
254
subheader_pointer_length)
252
255
offset += parser._current_row_on_page_index * parser.row_length
253
- process_byte_array_with_data(parser, offset, parser.row_length,
254
- parser._byte_chunk,
255
- parser._string_chunk)
256
+ process_byte_array_with_data(parser, offset, parser.row_length)
256
257
mn = min (parser.row_count, parser._mix_page_row_count)
257
258
if parser._current_row_on_page_index == mn:
258
259
done = parser._read_next_page()
@@ -266,8 +267,7 @@ def _readline(parser):
266
267
const.subheader_pointers_offset +
267
268
parser._current_row_on_page_index *
268
269
parser.row_length,
269
- parser.row_length, parser._byte_chunk,
270
- parser._string_chunk)
270
+ parser.row_length)
271
271
flag = (parser._current_row_on_page_index ==
272
272
parser._current_page_block_count)
273
273
if flag:
@@ -281,25 +281,20 @@ def _readline(parser):
281
281
parser._current_page_type)
282
282
283
283
284
- def process_byte_array_with_data (parser , int offset , int length , uint8_t[:, ::1] byte_chunk ,
285
- object[:, ::1] string_chunk ):
286
-
287
- cdef int s
288
- cdef int j
289
- cdef int k
290
- cdef int m
291
- cdef int start
292
- cdef int jb
293
- cdef int js
294
- cdef int lngt
284
+ cdef void process_byte_array_with_data(object parser, int offset, int length):
295
285
296
- cdef long [:] lengths = parser._column_data_lengths
297
- cdef long [:] offsets = parser._column_data_offsets
298
- cdef char [:] column_types = parser.column_types
286
+ cdef:
287
+ int s, j, k, m, start, jb, js, lngt
288
+ long [:] lengths = parser._column_data_lengths
289
+ long [:] offsets = parser._column_data_offsets
290
+ char [:] column_types = parser.column_types
291
+ uint8_t[:, :] byte_chunk = parser._byte_chunk
292
+ # object[:, :] string_chunk = parser._string_chunk
299
293
300
- source = parser._cached_page[offset:offset+ length]
294
+ source = np.frombuffer( parser._cached_page[offset:offset+ length], dtype = np.uint8)
301
295
if (parser.compression != " " ) and (length < parser.row_length):
302
- source = parser._decompress(parser.row_length, source)
296
+ source = decompress(parser, parser.row_length, source)
297
+ return
303
298
304
299
s = 8 * parser._current_row_in_chunk_index
305
300
js = 0
@@ -318,10 +313,10 @@ def process_byte_array_with_data(parser, int offset, int length, uint8_t[:, ::1]
318
313
byte_chunk[jb, m + k] = source[start + k]
319
314
jb += 1
320
315
elif column_types[j] == b' s' :
321
- string_chunk [js, parser._current_row_in_chunk_index] = bytes( source[start:start+ lngt] )
316
+ parser._string_chunk [js][ parser._current_row_in_chunk_index] = source[start:( start+ lngt)].tostring().rstrip( )
322
317
js += 1
323
318
else :
324
- raise ValueError (" unknown column type: %s " % parser.columns[j].ctype)
319
+ raise ValueError (" unknown column type: %s " % parser.columns[j].ctype)
325
320
326
321
parser._current_row_on_page_index += 1
327
322
parser._current_row_in_chunk_index += 1
0 commit comments