Skip to content

Commit 7e156b7

Browse files
committed
Working on cython issues
1 parent dc330c5 commit 7e156b7

File tree

2 files changed

+59
-73
lines changed

2 files changed

+59
-73
lines changed

pandas/io/sas/sas7bdat.py

+8 -17
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121
import struct
2222
import pandas.io.sas.sas_constants as const
23-
from .saslib import (_rle_decompress, _rdc_decompress, _do_read)
23+
from pandas.io.sas.saslib import do_read
2424

2525

2626
class _subheader_pointer(object):
@@ -550,11 +550,14 @@ def read(self, nrows=None):
550550
nd = (self.column_types == b'd').sum()
551551
ns = (self.column_types == b's').sum()
552552

553-
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
553+
self._string_chunk = []
554+
for j,ct in enumerate(self.column_types):
555+
if ct == b's':
556+
self._string_chunk.append([None] * nrows)
554557
self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)
555558

556559
self._current_row_in_chunk_index = 0
557-
_do_read(self, nrows)
560+
do_read(self, nrows)
558561

559562
rslt = self._chunk_to_dataframe()
560563
if self.index is not None:
@@ -583,16 +586,6 @@ def _read_next_page(self):
583586

584587
return False
585588

586-
def _decompress(self, row_length, page):
587-
page = np.frombuffer(page, dtype=np.uint8)
588-
if self.compression == const.rle_compression:
589-
return _rle_decompress(row_length, page)
590-
elif self.compression == const.rdc_compression:
591-
return _rdc_decompress(row_length, page)
592-
else:
593-
raise ValueError("unknown SAS compression method: %s" %
594-
self.compression)
595-
596589
def _chunk_to_dataframe(self):
597590

598591
n = self._current_row_in_chunk_index
@@ -614,11 +607,9 @@ def _chunk_to_dataframe(self):
614607
rslt[name] = epoch + pd.to_timedelta(rslt[name], unit='d')
615608
jb += 1
616609
elif self.column_types[j] == b's':
617-
rslt[name] = self._string_chunk[js, :]
618-
rslt[name] = rslt[name].apply(lambda x: x.rstrip(b'\x00 '))
610+
rslt[name] = pd.Series(self._string_chunk[js], dtype=np.object)
619611
if self.convert_text and (self.encoding is not None):
620-
rslt[name] = rslt[name].apply(
621-
lambda x: x.decode(encoding=self.encoding))
612+
rslt[name] = rslt[name].str.decode(self.encoding)
622613
if self.blank_missing:
623614
ii = rslt[name].str.len() == 0
624615
rslt.loc[ii, name] = np.nan

pandas/io/sas/saslib.pyx

+51 -56
Original file line number | Diff line number | Diff line change
@@ -8,17 +8,15 @@ import sas_constants as const
88
# algorithm. It is partially documented here:
99
#
1010
# https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
11-
def _rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
11+
cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
1212

13-
cdef uint8_t control_byte
14-
cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
15-
16-
cdef int rpos = 0
17-
cdef int ipos = 0
18-
cdef int i
19-
cdef int nbytes
20-
cdef uint8_t x
21-
cdef length = len(inbuff)
13+
cdef:
14+
uint8_t control_byte, x
15+
np.ndarray[uint8_t, ndim=1] result = np.zeros(result_length, np.uint8)
16+
int rpos = 0
17+
int ipos = 0
18+
int i, nbytes
19+
length = len(inbuff)
2220

2321
while ipos < length:
2422
control_byte = inbuff[ipos] & 0xF0
@@ -107,24 +105,22 @@ def _rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
107105
if len(result) != result_length:
108106
print("RLE: %v != %v\n", (len(result), result_length))
109107

110-
return np.asarray(result).tostring()
108+
return np.asarray(result, dtype=np.uint8)
111109

112110

113111
# rdc_decompress decompresses data using the Ross Data Compression algorithm:
114112
#
115113
# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
116-
def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
117-
118-
cdef uint8_t cmd
119-
cdef uint16_t ctrl_bits
120-
cdef uint16_t ctrl_mask = 0
121-
cdef uint16_t ofs
122-
cdef uint16_t cnt
123-
cdef int ipos = 0
124-
cdef int rpos = 0
125-
cdef int k
114+
cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
126115

127-
cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8)
116+
cdef:
117+
uint8_t cmd, ofs, cnt
118+
uint16_t ctrl_bits
119+
uint16_t ctrl_mask = 0
120+
int ipos = 0
121+
int rpos = 0
122+
int k
123+
np.ndarray[uint8_t, ndim=1] outbuff = np.zeros(result_length, dtype=np.uint8)
128124

129125
ii = -1
130126

@@ -191,24 +187,33 @@ def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
191187
if len(outbuff) != result_length:
192188
raise ValueError("RDC: %v != %v\n", len(outbuff), result_length)
193189

194-
return np.asarray(outbuff).tostring()
190+
return np.asarray(outbuff, dtype=np.uint8)
195191

196192

197-
def _do_read(parser, int nrows):
193+
cdef np.ndarray[uint8_t, ndim=1] decompress(object parser, int row_length, uint8_t[:] page):
194+
page = np.frombuffer(page, dtype=np.uint8)
195+
if parser.compression == const.rle_compression:
196+
return rle_decompress(row_length, page)
197+
elif parser.compression == const.rdc_compression:
198+
return rdc_decompress(row_length, page)
199+
else:
200+
raise ValueError("unknown SAS compression method: %s" %
201+
parser.compression)
202+
203+
204+
def do_read(object parser, int nrows):
198205
cdef int i
199206

200207
for i in range(nrows):
201-
done = _readline(parser)
208+
done = readline(parser)
202209
if done:
203210
break
204211

205212

206-
def _readline(parser):
213+
cdef bint readline(object parser):
207214

208-
cdef int offset
209-
cdef int bit_offset
210-
cdef int align_correction
211-
cdef int subheader_pointer_length
215+
cdef:
216+
int offset, bit_offset, align_correction, subheader_pointer_length
212217

213218
bit_offset = parser._page_bit_offset
214219
subheader_pointer_length = parser._subheader_pointer_length
@@ -236,9 +241,7 @@ def _readline(parser):
236241
parser._current_row_on_page_index])
237242
process_byte_array_with_data(parser,
238243
current_subheader_pointer.offset,
239-
current_subheader_pointer.length,
240-
parser._byte_chunk,
241-
parser._string_chunk)
244+
current_subheader_pointer.length)
242245
return False
243246
elif parser._current_page_type in const.page_mix_types:
244247
align_correction = (bit_offset + const.subheader_pointers_offset +
@@ -250,9 +253,7 @@ def _readline(parser):
250253
offset += (parser._current_page_subheaders_count *
251254
subheader_pointer_length)
252255
offset += parser._current_row_on_page_index * parser.row_length
253-
process_byte_array_with_data(parser, offset, parser.row_length,
254-
parser._byte_chunk,
255-
parser._string_chunk)
256+
process_byte_array_with_data(parser, offset, parser.row_length)
256257
mn = min(parser.row_count, parser._mix_page_row_count)
257258
if parser._current_row_on_page_index == mn:
258259
done = parser._read_next_page()
@@ -266,8 +267,7 @@ def _readline(parser):
266267
const.subheader_pointers_offset +
267268
parser._current_row_on_page_index *
268269
parser.row_length,
269-
parser.row_length, parser._byte_chunk,
270-
parser._string_chunk)
270+
parser.row_length)
271271
flag = (parser._current_row_on_page_index ==
272272
parser._current_page_block_count)
273273
if flag:
@@ -281,25 +281,20 @@ def _readline(parser):
281281
parser._current_page_type)
282282

283283

284-
def process_byte_array_with_data(parser, int offset, int length, uint8_t[:, ::1] byte_chunk,
285-
object[:, ::1] string_chunk):
286-
287-
cdef int s
288-
cdef int j
289-
cdef int k
290-
cdef int m
291-
cdef int start
292-
cdef int jb
293-
cdef int js
294-
cdef int lngt
284+
cdef void process_byte_array_with_data(object parser, int offset, int length):
295285

296-
cdef long[:] lengths = parser._column_data_lengths
297-
cdef long[:] offsets = parser._column_data_offsets
298-
cdef char[:] column_types = parser.column_types
286+
cdef:
287+
int s, j, k, m, start, jb, js, lngt
288+
long[:] lengths = parser._column_data_lengths
289+
long[:] offsets = parser._column_data_offsets
290+
char[:] column_types = parser.column_types
291+
uint8_t[:, :] byte_chunk = parser._byte_chunk
292+
#object[:, :] string_chunk = parser._string_chunk
299293

300-
source = parser._cached_page[offset:offset+length]
294+
source = np.frombuffer(parser._cached_page[offset:offset+length], dtype=np.uint8)
301295
if (parser.compression != "") and (length < parser.row_length):
302-
source = parser._decompress(parser.row_length, source)
296+
source = decompress(parser, parser.row_length, source)
297+
return
303298

304299
s = 8 * parser._current_row_in_chunk_index
305300
js = 0
@@ -318,10 +313,10 @@ def process_byte_array_with_data(parser, int offset, int length, uint8_t[:, ::1]
318313
byte_chunk[jb, m + k] = source[start + k]
319314
jb += 1
320315
elif column_types[j] == b's':
321-
string_chunk[js, parser._current_row_in_chunk_index] = bytes(source[start:start+lngt])
316+
parser._string_chunk[js][parser._current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
322317
js += 1
323318
else:
324-
raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
319+
raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
325320

326321
parser._current_row_on_page_index += 1
327322
parser._current_row_in_chunk_index += 1

0 commit comments

Comments
 (0)