
Commit a7df841

Modest performance, address #12647
Parent: 797baf9

2 files changed: +32 -27 lines

pandas/io/sas/sas7bdat.py (+11 -7)
@@ -327,12 +327,12 @@ def _get_properties(self):
                                 _os_version_number_length)
         self.os_version = buf.rstrip(b'\x00 ').decode()
 
-        buf = self._read_bytes(
-            _os_name_offset, _os_name_length).rstrip(b'\x00 ')
+        buf = self._read_bytes(_os_name_offset, _os_name_length)
+        buf = buf.rstrip(b'\x00 ')
         if len(buf) > 0:
-            self.os_name = buf.rstrip(b'\x00 ').decode()
+            self.os_name = buf.decode()
         else:
-            buf = self._path_or_buf.read(_os_maker_offset, _os_maker_length)
+            buf = self._read_bytes(_os_maker_offset, _os_maker_length)
             self.os_name = buf.rstrip(b'\x00 ').decode()
 
     # Read a single float of the given width (4 or 8).
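Besides dropping a redundant second rstrip, this hunk fixes a latent bug: the fallback branch called self._path_or_buf.read(_os_maker_offset, _os_maker_length), but a file object's read() accepts only a size argument, so that call could never have worked as intended. The fix routes the read through the reader's _read_bytes helper. A minimal standalone sketch of the contract that helper is assumed to provide (the actual method in sas7bdat.py also handles cached pages and cleanup on error):

import io

def read_bytes(buf, offset, length):
    # Seek to an absolute offset and return exactly `length` bytes.
    # file.read() itself takes only a size argument, which is why the
    # old self._path_or_buf.read(offset, length) call was broken.
    buf.seek(offset)
    chunk = buf.read(length)
    if len(chunk) < length:
        raise ValueError("unable to read %d bytes at offset %d"
                         % (length, offset))
    return chunk

data = io.BytesIO(b"\x00\x00SAS 9.4")
print(read_bytes(data, 2, 7))  # b'SAS 9.4'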
@@ -592,6 +592,10 @@ def _process_columnattributes_subheader(self, offset, length):
             length - 2 * int_len - 12) // (int_len + 8)
         self.column_types = np.empty(
             column_attributes_vectors_count, dtype=np.dtype('S1'))
+        self._column_data_lengths = np.empty(
+            column_attributes_vectors_count, dtype=np.int64)
+        self._column_data_offsets = np.empty(
+            column_attributes_vectors_count, dtype=np.int64)
         for i in range(column_attributes_vectors_count):
             col_data_offset = (offset + int_len +
                                _column_data_offset_offset + i * (int_len + 8))
@@ -600,11 +604,11 @@ def _process_columnattributes_subheader(self, offset, length):
             col_types = (offset + 2 * int_len +
                          _column_type_offset + i * (int_len + 8))
 
-            self._column_data_offsets.append(
-                self._read_int(col_data_offset, int_len))
+            x = self._read_int(col_data_offset, int_len)
+            self._column_data_offsets[i] = x
 
             x = self._read_int(col_data_len, _column_data_length_length)
-            self._column_data_lengths.append(x)
+            self._column_data_lengths[i] = x
 
             x = self._read_int(col_types, _column_type_length)
             if x == 1:
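The change above replaces list growth with indexed fills: _column_data_lengths and _column_data_offsets are preallocated as int64 arrays of known size instead of being built up with Python list append. Besides avoiding incremental growth, this gives the Cython layer typed buffers it can wrap as long[:] memoryviews, which the saslib.pyx change below relies on. A minimal standalone sketch of the pattern (names and sizes are illustrative, not from the commit):

import numpy as np

n = 1000  # hypothetical column count

# Before: grow a Python list one element at a time.
offsets_list = []
for i in range(n):
    offsets_list.append(i * 8)

# After: preallocate a typed buffer of known size and fill by index.
# An int64 array can also be handed to Cython as a long[:] memoryview.
offsets_arr = np.empty(n, dtype=np.int64)
for i in range(n):
    offsets_arr[i] = i * 8

assert offsets_arr[10] == offsets_list[10] == 80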

pandas/io/sas/saslib.pyx (+21 -20)
@@ -1,6 +1,6 @@
 import numpy as np
 cimport numpy as np
-from numpy cimport uint8_t, uint16_t
+from numpy cimport uint8_t, uint16_t, int8_t
 
 # rle_decompress decompresses data using a Run Length Encoding
 # algorithm. It is partially documented here:
@@ -191,43 +191,44 @@ def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
 
     return np.asarray(outbuff).tostring()
 
-def process_byte_array_with_data(parser, int offset, int length, np.ndarray[uint8_t, ndim=2] byte_chunk,
-                                 np.ndarray[dtype=object, ndim=2] string_chunk):
+def process_byte_array_with_data(parser, int offset, int length, uint8_t[:, ::1] byte_chunk,
+                                 object[:, ::1] string_chunk):
 
     cdef int s
     cdef int j
+    cdef int k
     cdef int m
     cdef int start
-    cdef int end
-    cdef bytes source
-    cdef bytes temp
     cdef int jb
     cdef int js
+    cdef int lngt
+
+    cdef long[:] lengths = parser._column_data_lengths
+    cdef long[:] offsets = parser._column_data_offsets
+    cdef char[:] column_types = parser.column_types
 
+    source = parser._cached_page[offset:offset+length]
     if (parser.compression != "") and (length < parser.row_length):
-        source = parser._decompress(parser.row_length, parser._cached_page[offset:offset + length])
-    else:
-        source = parser._cached_page[offset:offset + length]
+        source = parser._decompress(parser.row_length, source)
 
     s = 8 * parser._current_row_in_chunk_index
     js = 0
     jb = 0
     for j in range(parser.column_count):
-        length = parser._column_data_lengths[j]
-        if length == 0:
+        lngt = lengths[j]
+        if lngt == 0:
             break
-        start = parser._column_data_offsets[j]
-        end = start + length
-        temp = source[start:end]
-        if parser.column_types[j] == b'd':
-            m = 8 - length
+        start = offsets[j]
+        if column_types[j] == b'd':
             if parser.byte_order == "<":
-                byte_chunk[jb, s+m:s+8] = np.frombuffer(temp, dtype=np.uint8)
+                m = s + 8 - lngt
             else:
-                byte_chunk[jb, s:s+length] = np.frombuffer(temp, dtype=np.uint8)
+                m = s
+            for k in range(lngt):
+                byte_chunk[jb, m + k] = source[start + k]
             jb += 1
-        elif parser.column_types[j] == b's':
-            string_chunk[js, parser._current_row_in_chunk_index] = bytes(temp)
+        elif column_types[j] == b's':
+            string_chunk[js, parser._current_row_in_chunk_index] = bytes(source[start:start+lngt])
             js += 1
         else:
             raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
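Two things drive the speedup in this hunk. First, the buffer arguments become contiguous typed memoryviews (uint8_t[:, ::1], object[:, ::1]), and the per-column metadata lookups go through typed memoryviews (lengths, offsets, column_types) instead of Python attribute access inside the loop. Second, the numeric path drops the per-cell np.frombuffer slice assignment in favor of a plain byte-copy loop, which Cython compiles to a C loop with no per-column Python-object overhead. The right-alignment for little-endian data (m = s + 8 - lngt) exists because SAS stores truncated doubles by keeping only their high-order bytes, which sit at the end of an 8-byte little-endian slot. A small pure-Python sketch of that placement logic (place_number is a hypothetical stand-in for the loop body, not part of the commit):

import numpy as np

def place_number(byte_chunk, jb, s, field, byte_order="<"):
    # Copy a lngt-byte numeric field into an 8-byte slot of byte_chunk.
    # Little-endian fields are right-aligned so that the surviving
    # high-order bytes land in the correct positions.
    lngt = len(field)
    m = s + 8 - lngt if byte_order == "<" else s
    for k in range(lngt):              # per-byte copy, as in the Cython loop
        byte_chunk[jb, m + k] = field[k]

byte_chunk = np.zeros((1, 8), dtype=np.uint8)
# 1.5 truncated to its 3 high-order bytes (0x00, 0xF8, 0x3F in
# little-endian storage order); zero-padding the low-order bytes
# restores the original value exactly.
place_number(byte_chunk, 0, 0, b"\x00\xf8\x3f")
print(byte_chunk.view(np.float64))     # [[1.5]] on a little-endian machine

The view call at the end reinterprets the filled 8-byte slot as a float64, confirming that the truncated field was reassembled into a full double.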
