Skip to content

Commit 11c2f31

Browse files
committed
Further cythonization
1 parent 3ef626e commit 11c2f31

File tree

2 files changed

+31
-32
lines changed

2 files changed

+31
-32
lines changed

pandas/io/sas/sas7bdat.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -550,10 +550,7 @@ def read(self, nrows=None):
550550
nd = (self.column_types == b'd').sum()
551551
ns = (self.column_types == b's').sum()
552552

553-
self._string_chunk = []
554-
for j,ct in enumerate(self.column_types):
555-
if ct == b's':
556-
self._string_chunk.append([None] * nrows)
553+
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
557554
self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)
558555

559556
self._current_row_in_chunk_index = 0
@@ -607,7 +604,7 @@ def _chunk_to_dataframe(self):
607604
rslt[name] = epoch + pd.to_timedelta(rslt[name], unit='d')
608605
jb += 1
609606
elif self.column_types[j] == b's':
610-
rslt[name] = pd.Series(self._string_chunk[js], dtype=np.object)
607+
rslt[name] = self._string_chunk[js, :]
611608
if self.convert_text and (self.encoding is not None):
612609
rslt[name] = rslt[name].str.decode(self.encoding)
613610
if self.blank_missing:

pandas/io/sas/saslib.pyx

+29-27
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,21 @@ cimport numpy as np
33
from numpy cimport uint8_t, uint16_t, int8_t
44
import sas_constants as const
55

6-
76
# rle_decompress decompresses data using a Run Length Encoding
87
# algorithm. It is partially documented here:
98
#
109
# https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
11-
cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
10+
cdef rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
1211

13-
cdef:
14-
uint8_t control_byte, x
15-
np.ndarray[uint8_t, ndim=1] result = np.zeros(result_length, np.uint8)
16-
int rpos = 0
17-
int ipos = 0
18-
int i, nbytes
19-
length = len(inbuff)
12+
cdef uint8_t control_byte
13+
cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
14+
15+
cdef int rpos = 0
16+
cdef int ipos = 0
17+
cdef int i
18+
cdef int nbytes
19+
cdef uint8_t x
20+
cdef length = len(inbuff)
2021

2122
while ipos < length:
2223
control_byte = inbuff[ipos] & 0xF0
@@ -105,22 +106,24 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui
105106
if len(result) != result_length:
106107
print("RLE: %v != %v\n", (len(result), result_length))
107108

108-
return np.asarray(result, dtype=np.uint8)
109+
return np.asarray(result).tostring()
109110

110111

111112
# rdc_decompress decompresses data using the Ross Data Compression algorithm:
112113
#
113114
# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
114-
cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
115+
cdef rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
115116

116-
cdef:
117-
uint8_t cmd, ofs, cnt
118-
uint16_t ctrl_bits
119-
uint16_t ctrl_mask = 0
120-
int ipos = 0
121-
int rpos = 0
122-
int k
123-
np.ndarray[uint8_t, ndim=1] outbuff = np.zeros(result_length, dtype=np.uint8)
117+
cdef uint8_t cmd
118+
cdef uint16_t ctrl_bits
119+
cdef uint16_t ctrl_mask = 0
120+
cdef uint16_t ofs
121+
cdef uint16_t cnt
122+
cdef int ipos = 0
123+
cdef int rpos = 0
124+
cdef int k
125+
126+
cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8)
124127

125128
ii = -1
126129

@@ -187,10 +190,9 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui
187190
if len(outbuff) != result_length:
188191
raise ValueError("RDC: %v != %v\n", len(outbuff), result_length)
189192

190-
return np.asarray(outbuff, dtype=np.uint8)
191-
193+
return np.asarray(outbuff).tostring()
192194

193-
cdef np.ndarray[uint8_t, ndim=1] decompress(object parser, int row_length, uint8_t[:] page):
195+
cdef decompress(object parser, int row_length, page):
194196
page = np.frombuffer(page, dtype=np.uint8)
195197
if parser.compression == const.rle_compression:
196198
return rle_decompress(row_length, page)
@@ -210,7 +212,7 @@ def do_read(object parser, int nrows):
210212
break
211213

212214

213-
cdef bint readline(object parser):
215+
cdef readline(object parser):
214216

215217
cdef:
216218
int offset, bit_offset, align_correction, subheader_pointer_length
@@ -281,17 +283,17 @@ cdef bint readline(object parser):
281283
parser._current_page_type)
282284

283285

284-
cdef void process_byte_array_with_data(object parser, int offset, int length):
286+
cdef process_byte_array_with_data(object parser, int offset, int length):
285287

286288
cdef:
287289
int s, j, k, m, start, jb, js, lngt
288290
long[:] lengths = parser._column_data_lengths
289291
long[:] offsets = parser._column_data_offsets
290292
char[:] column_types = parser.column_types
291293
uint8_t[:, :] byte_chunk = parser._byte_chunk
292-
#object[:, :] string_chunk = parser._string_chunk
294+
object[:, :] string_chunk = parser._string_chunk
293295

294-
source = np.frombuffer(parser._cached_page[offset:offset+length], dtype=np.uint8)
296+
source = parser._cached_page[offset:offset+length]
295297
if (parser.compression != "") and (length < parser.row_length):
296298
source = decompress(parser, parser.row_length, source)
297299

@@ -312,7 +314,7 @@ cdef void process_byte_array_with_data(object parser, int offset, int length):
312314
byte_chunk[jb, m + k] = source[start + k]
313315
jb += 1
314316
elif column_types[j] == b's':
315-
parser._string_chunk[js][parser._current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
317+
string_chunk[js, parser._current_row_in_chunk_index] = source[start:(start+lngt)].rstrip()
316318
js += 1
317319
else:
318320
raise ValueError("unknown column type: %s" % parser.columns[j].ctype)

0 commit comments

Comments
 (0)