Skip to content

Commit d444ffa

Browse files
committed
PERF: some more perf/clean in saslib.pyx
closes #12961
1 parent f1bb9e3 commit d444ffa

File tree

2 files changed

+74
-38
lines changed

2 files changed

+74
-38
lines changed

doc/source/whatsnew/v0.18.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ Deprecations
402402
Performance Improvements
403403
~~~~~~~~~~~~~~~~~~~~~~~~
404404

405-
- Improved speed of SAS reader (:issue:`12656`)
405+
- Improved speed of SAS reader (:issue:`12656`, :issue:`12961`)
406406
- Performance improvements in ``.groupby(..).cumcount()`` (:issue:`11039`)
407407
- Improved memory usage in ``pd.read_csv()`` when using ``skiprows=an_integer`` (:issue:`13005`)
408408

pandas/io/sas/saslib.pyx

+73-37
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# cython: profile=False
2+
# cython: boundscheck=False, initializedcheck=False
3+
14
import numpy as np
25
cimport numpy as np
36
from numpy cimport uint8_t, uint16_t, int8_t, int64_t
@@ -10,19 +13,19 @@ import sas_constants as const
1013
cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
1114

1215
cdef:
13-
uint8_t control_byte, x, end_of_first_byte
16+
uint8_t control_byte, x
1417
uint8_t [:] result = np.zeros(result_length, np.uint8)
15-
int rpos = 0, ipos = 0, i, nbytes, length = len(inbuff)
18+
int rpos = 0, ipos = 0, i, nbytes, end_of_first_byte, length = len(inbuff)
1619

1720
while ipos < length:
1821
control_byte = inbuff[ipos] & 0xF0
19-
end_of_first_byte = int(inbuff[ipos] & 0x0F)
22+
end_of_first_byte = <int>(inbuff[ipos] & 0x0F)
2023
ipos += 1
2124

2225
if control_byte == 0x00:
2326
if end_of_first_byte != 0:
24-
print("Unexpected non-zero end_of_first_byte")
25-
nbytes = int(inbuff[ipos]) + 64
27+
raise ValueError("Unexpected non-zero end_of_first_byte")
28+
nbytes = <int>(inbuff[ipos]) + 64
2629
ipos += 1
2730
for i in range(nbytes):
2831
result[rpos] = inbuff[ipos]
@@ -31,20 +34,20 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
3134
elif control_byte == 0x40:
3235
# not documented
3336
nbytes = end_of_first_byte * 16
34-
nbytes += int(inbuff[ipos])
37+
nbytes += <int>(inbuff[ipos])
3538
ipos += 1
3639
for i in range(nbytes):
3740
result[rpos] = inbuff[ipos]
3841
rpos += 1
3942
ipos += 1
4043
elif control_byte == 0x60:
41-
nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17
44+
nbytes = end_of_first_byte*256 + <int>(inbuff[ipos]) + 17
4245
ipos += 1
4346
for i in range(nbytes):
4447
result[rpos] = 0x20
4548
rpos += 1
4649
elif control_byte == 0x70:
47-
nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17
50+
nbytes = end_of_first_byte*256 + <int>(inbuff[ipos]) + 17
4851
ipos += 1
4952
for i in range(nbytes):
5053
result[rpos] = 0x00
@@ -99,7 +102,7 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
99102
raise ValueError("unknown control byte: %v", control_byte)
100103

101104
if len(result) != result_length:
102-
print("RLE: %v != %v\n", (len(result), result_length))
105+
raise ValueError("RLE: %v != %v", (len(result), result_length))
103106

104107
return np.asarray(result)
105108

@@ -162,7 +165,7 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
162165
ipos += 1
163166
cnt += 16
164167
for k in range(cnt):
165-
outbuff[rpos + k] = outbuff[rpos - int(ofs) + k]
168+
outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
166169
rpos += cnt
167170

168171
# short pattern
@@ -171,7 +174,7 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
171174
ofs += <uint16_t>inbuff[ipos] << 4
172175
ipos += 1
173176
for k in range(cmd):
174-
outbuff[rpos + k] = outbuff[rpos - int(ofs) + k]
177+
outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
175178
rpos += cmd
176179

177180
else:
@@ -182,6 +185,17 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
182185

183186
return np.asarray(outbuff)
184187

188+
cdef enum ColumnTypes:
189+
column_type_decimal = 1
190+
column_type_string = 2
191+
192+
193+
# type the page_data types
194+
cdef int page_meta_type = const.page_meta_type
195+
cdef int page_mix_types_0 = const.page_mix_types[0]
196+
cdef int page_mix_types_1 = const.page_mix_types[1]
197+
cdef int page_data_type = const.page_data_type
198+
cdef int subheader_pointers_offset = const.subheader_pointers_offset
185199

186200
cdef class Parser(object):
187201

@@ -194,11 +208,16 @@ cdef class Parser(object):
194208
object[:, :] string_chunk
195209
char *cached_page
196210
int current_row_on_page_index
211+
int current_page_block_count
212+
int current_page_data_subheader_pointers_len
213+
int current_page_subheaders_count
197214
int current_row_in_chunk_index
198215
int current_row_in_file_index
216+
int header_length
199217
int row_length
200218
int bit_offset
201219
int subheader_pointer_length
220+
int current_page_type
202221
bint is_little_endian
203222
np.ndarray[uint8_t, ndim=1] (*decompress)(int result_length, np.ndarray[uint8_t, ndim=1] inbuff)
204223
object parser
@@ -208,30 +227,30 @@ cdef class Parser(object):
208227
int j
209228
char[:] column_types
210229

211-
self.current_row_on_page_index = parser._current_row_on_page_index
212-
self.current_row_in_chunk_index = parser._current_row_in_chunk_index
213-
self.current_row_in_file_index = parser._current_row_in_file_index
214230
self.parser = parser
231+
self.header_length = self.parser.header_length
215232
self.column_count = parser.column_count
216233
self.lengths = parser._column_data_lengths
217234
self.offsets = parser._column_data_offsets
218235
self.byte_chunk = parser._byte_chunk
219236
self.string_chunk = parser._string_chunk
220237
self.row_length = parser.row_length
221-
self.cached_page = <char *>parser._cached_page
222238
self.bit_offset = self.parser._page_bit_offset
223239
self.subheader_pointer_length = self.parser._subheader_pointer_length
224240
self.is_little_endian = parser.byte_order == "<"
225241
self.column_types = np.empty(self.column_count, dtype='int64')
226242

243+
# page indicators
244+
self.update_next_page()
245+
227246
column_types = parser.column_types
228247

229248
# map column types
230249
for j in range(self.column_count):
231250
if column_types[j] == b'd':
232-
self.column_types[j] = 1
251+
self.column_types[j] = column_type_decimal
233252
elif column_types[j] == b's':
234-
self.column_types[j] = 2
253+
self.column_types[j] = column_type_string
235254
else:
236255
raise ValueError("unknown column type: %s" % self.parser.columns[j].ctype)
237256

@@ -243,6 +262,11 @@ cdef class Parser(object):
243262
else:
244263
self.decompress = NULL
245264

265+
# update to current state of the parser
266+
self.current_row_in_chunk_index = parser._current_row_in_chunk_index
267+
self.current_row_in_file_index = parser._current_row_in_file_index
268+
self.current_row_on_page_index = parser._current_row_on_page_index
269+
246270
def read(self, int nrows):
247271
cdef:
248272
bint done
@@ -265,31 +289,39 @@ cdef class Parser(object):
265289
if done:
266290
self.cached_page = NULL
267291
else:
268-
self.cached_page = <char *>self.parser._cached_page
269-
self.current_row_on_page_index = 0
292+
self.update_next_page()
270293
return done
271294

295+
cdef update_next_page(self):
296+
# update data for the current page
297+
298+
self.cached_page = <char *>self.parser._cached_page
299+
self.current_row_on_page_index = 0
300+
self.current_page_type = self.parser._current_page_type
301+
self.current_page_block_count = self.parser._current_page_block_count
302+
self.current_page_data_subheader_pointers_len = len(self.parser._current_page_data_subheader_pointers)
303+
self.current_page_subheaders_count = self.parser._current_page_subheaders_count
304+
272305
cdef bint readline(self):
273306

274307
cdef:
275-
int offset, bit_offset, align_correction, subheader_pointer_length
308+
int offset, bit_offset, align_correction, subheader_pointer_length, mn
276309
bint done, flag
277310

278311
bit_offset = self.bit_offset
279312
subheader_pointer_length = self.subheader_pointer_length
280313

281314
# If there is no page, go to the end of the header and read a page.
282315
if self.cached_page == NULL:
283-
self.parser._path_or_buf.seek(self.parser.header_length)
316+
self.parser._path_or_buf.seek(self.header_length)
284317
done = self.read_next_page()
285318
if done:
286319
return True
287320

288321
# Loop until a data row is read
289322
while True:
290-
if self.parser._current_page_type == const.page_meta_type:
291-
flag = (self.current_row_on_page_index >=
292-
len(self.parser._current_page_data_subheader_pointers))
323+
if self.current_page_type == page_meta_type:
324+
flag = self.current_row_on_page_index >= self.current_page_data_subheader_pointers_len
293325
if flag:
294326
done = self.read_next_page()
295327
if done:
@@ -301,14 +333,14 @@ cdef class Parser(object):
301333
self.process_byte_array_with_data(current_subheader_pointer.offset,
302334
current_subheader_pointer.length)
303335
return False
304-
elif self.parser._current_page_type in const.page_mix_types:
305-
align_correction = (bit_offset + const.subheader_pointers_offset +
306-
self.parser._current_page_subheaders_count *
336+
elif self.current_page_type == page_mix_types_0 or self.current_page_type == page_mix_types_1:
337+
align_correction = (bit_offset + subheader_pointers_offset +
338+
self.current_page_subheaders_count *
307339
subheader_pointer_length)
308340
align_correction = align_correction % 8
309341
offset = bit_offset + align_correction
310-
offset += const.subheader_pointers_offset
311-
offset += (self.parser._current_page_subheaders_count *
342+
offset += subheader_pointers_offset
343+
offset += (self.current_page_subheaders_count *
312344
subheader_pointer_length)
313345
offset += self.current_row_on_page_index * self.row_length
314346
self.process_byte_array_with_data(offset,
@@ -319,27 +351,29 @@ cdef class Parser(object):
319351
if done:
320352
return True
321353
return False
322-
elif self.parser._current_page_type == const.page_data_type:
354+
elif self.current_page_type == page_data_type:
323355
self.process_byte_array_with_data(bit_offset +
324-
const.subheader_pointers_offset +
356+
subheader_pointers_offset +
325357
self.current_row_on_page_index *
326358
self.row_length,
327359
self.row_length)
328360
flag = (self.current_row_on_page_index ==
329-
self.parser._current_page_block_count)
361+
self.current_page_block_count)
330362
if flag:
331363
done = self.read_next_page()
332364
if done:
333365
return True
334366
return False
335367
else:
336368
raise ValueError("unknown page type: %s",
337-
self.parser._current_page_type)
369+
self.current_page_type)
338370

339371
cdef void process_byte_array_with_data(self, int offset, int length):
340372

341373
cdef:
342-
int s, j, k, m, jb, js, lngt, start
374+
Py_ssize_t j
375+
int s, k, m, jb, js, current_row
376+
int64_t lngt, start, ct
343377
np.ndarray[uint8_t, ndim=1] source
344378
int64_t[:] column_types
345379
int64_t[:] lengths
@@ -352,6 +386,7 @@ cdef class Parser(object):
352386
if self.decompress != NULL and (length < self.row_length):
353387
source = self.decompress(self.row_length, source)
354388

389+
current_row = self.current_row_in_chunk_index
355390
column_types = self.column_types
356391
lengths = self.lengths
357392
offsets = self.offsets
@@ -365,7 +400,8 @@ cdef class Parser(object):
365400
if lngt == 0:
366401
break
367402
start = offsets[j]
368-
if column_types[j] == 1:
403+
ct = column_types[j]
404+
if ct == column_type_decimal:
369405
# decimal
370406
if self.is_little_endian:
371407
m = s + 8 - lngt
@@ -374,9 +410,9 @@ cdef class Parser(object):
374410
for k in range(lngt):
375411
byte_chunk[jb, m + k] = source[start + k]
376412
jb += 1
377-
elif column_types[j] == 2:
413+
elif column_types[j] == column_type_string:
378414
# string
379-
string_chunk[js, self.current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
415+
string_chunk[js, current_row] = source[start:(start+lngt)].tostring().rstrip()
380416
js += 1
381417

382418
self.current_row_on_page_index += 1

0 commit comments

Comments
 (0)