
Commit a7df841

Modest performance, address #12647
Parent: 797baf9

2 files changed: +32 -27 lines

pandas/io/sas/sas7bdat.py (+11 -7)
@@ -327,12 +327,12 @@ def _get_properties(self):
                                 _os_version_number_length)
         self.os_version = buf.rstrip(b'\x00 ').decode()
 
-        buf = self._read_bytes(
-            _os_name_offset, _os_name_length).rstrip(b'\x00 ')
+        buf = self._read_bytes(_os_name_offset, _os_name_length)
+        buf = buf.rstrip(b'\x00 ')
         if len(buf) > 0:
-            self.os_name = buf.rstrip(b'\x00 ').decode()
+            self.os_name = buf.decode()
         else:
-            buf = self._path_or_buf.read(_os_maker_offset, _os_maker_length)
+            buf = self._read_bytes(_os_maker_offset, _os_maker_length)
             self.os_name = buf.rstrip(b'\x00 ').decode()
 
     # Read a single float of the given width (4 or 8).
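Besides dropping a redundant second rstrip, this hunk fixes a latent bug: the fallback branch called self._path_or_buf.read(_os_maker_offset, _os_maker_length), but a file object's read() accepts only a size argument, so that call could never have worked as intended. The fix routes the read through the reader's _read_bytes helper. A minimal standalone sketch of the contract that helper is assumed to provide (the actual method in sas7bdat.py also handles cached pages and cleanup on error):

import io

def read_bytes(buf, offset, length):
    # Seek to an absolute offset and return exactly `length` bytes.
    # file.read() itself takes only a size argument, which is why the
    # old self._path_or_buf.read(offset, length) call was broken.
    buf.seek(offset)
    chunk = buf.read(length)
    if len(chunk) < length:
        raise ValueError("unable to read %d bytes at offset %d"
                         % (length, offset))
    return chunk

data = io.BytesIO(b"\x00\x00SAS 9.4")
print(read_bytes(data, 2, 7))  # b'SAS 9.4'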
@@ -592,6 +592,10 @@ def _process_columnattributes_subheader(self, offset, length):
             length - 2 * int_len - 12) // (int_len + 8)
         self.column_types = np.empty(
             column_attributes_vectors_count, dtype=np.dtype('S1'))
+        self._column_data_lengths = np.empty(
+            column_attributes_vectors_count, dtype=np.int64)
+        self._column_data_offsets = np.empty(
+            column_attributes_vectors_count, dtype=np.int64)
         for i in range(column_attributes_vectors_count):
             col_data_offset = (offset + int_len +
                                _column_data_offset_offset + i * (int_len + 8))
@@ -600,11 +604,11 @@ def _process_columnattributes_subheader(self, offset, length):
             col_types = (offset + 2 * int_len +
                          _column_type_offset + i * (int_len + 8))
 
-            self._column_data_offsets.append(
-                self._read_int(col_data_offset, int_len))
+            x = self._read_int(col_data_offset, int_len)
+            self._column_data_offsets[i] = x
 
             x = self._read_int(col_data_len, _column_data_length_length)
-            self._column_data_lengths.append(x)
+            self._column_data_lengths[i] = x
 
             x = self._read_int(col_types, _column_type_length)
             if x == 1:
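The change above replaces list growth with indexed fills: _column_data_lengths and _column_data_offsets are preallocated as int64 arrays of known size instead of being built up with Python list append. Besides avoiding incremental growth, this gives the Cython layer typed buffers it can wrap as long[:] memoryviews, which the saslib.pyx change below relies on. A minimal standalone sketch of the pattern (names and sizes are illustrative, not from the commit):

import numpy as np

n = 1000  # hypothetical column count

# Before: grow a Python list one element at a time.
offsets_list = []
for i in range(n):
    offsets_list.append(i * 8)

# After: preallocate a typed buffer of known size and fill by index.
# An int64 array can also be handed to Cython as a long[:] memoryview.
offsets_arr = np.empty(n, dtype=np.int64)
for i in range(n):
    offsets_arr[i] = i * 8

assert offsets_arr[10] == offsets_list[10] == 80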

pandas/io/sas/saslib.pyx (+21 -20)
@@ -1,6 +1,6 @@
 import numpy as np
 cimport numpy as np
-from numpy cimport uint8_t, uint16_t
+from numpy cimport uint8_t, uint16_t, int8_t
 
 # rle_decompress decompresses data using a Run Length Encoding
 # algorithm. It is partially documented here:
@@ -191,43 +191,44 @@ def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
 
     return np.asarray(outbuff).tostring()
 
-def process_byte_array_with_data(parser, int offset, int length, np.ndarray[uint8_t, ndim=2] byte_chunk,
-                                 np.ndarray[dtype=object, ndim=2] string_chunk):
+def process_byte_array_with_data(parser, int offset, int length, uint8_t[:, ::1] byte_chunk,
+                                 object[:, ::1] string_chunk):
 
     cdef int s
     cdef int j
+    cdef int k
     cdef int m
     cdef int start
-    cdef int end
-    cdef bytes source
-    cdef bytes temp
     cdef int jb
     cdef int js
+    cdef int lngt
+
+    cdef long[:] lengths = parser._column_data_lengths
+    cdef long[:] offsets = parser._column_data_offsets
+    cdef char[:] column_types = parser.column_types
 
+    source = parser._cached_page[offset:offset+length]
     if (parser.compression != "") and (length < parser.row_length):
-        source = parser._decompress(parser.row_length, parser._cached_page[offset:offset + length])
-    else:
-        source = parser._cached_page[offset:offset + length]
+        source = parser._decompress(parser.row_length, source)
 
     s = 8 * parser._current_row_in_chunk_index
     js = 0
     jb = 0
     for j in range(parser.column_count):
-        length = parser._column_data_lengths[j]
-        if length == 0:
+        lngt = lengths[j]
+        if lngt == 0:
             break
-        start = parser._column_data_offsets[j]
-        end = start + length
-        temp = source[start:end]
-        if parser.column_types[j] == b'd':
-            m = 8 - length
+        start = offsets[j]
+        if column_types[j] == b'd':
             if parser.byte_order == "<":
-                byte_chunk[jb, s+m:s+8] = np.frombuffer(temp, dtype=np.uint8)
+                m = s + 8 - lngt
             else:
-                byte_chunk[jb, s:s+length] = np.frombuffer(temp, dtype=np.uint8)
+                m = s
+            for k in range(lngt):
+                byte_chunk[jb, m + k] = source[start + k]
             jb += 1
-        elif parser.column_types[j] == b's':
-            string_chunk[js, parser._current_row_in_chunk_index] = bytes(temp)
+        elif column_types[j] == b's':
+            string_chunk[js, parser._current_row_in_chunk_index] = bytes(source[start:start+lngt])
             js += 1
         else:
             raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
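Two things drive the speedup in this hunk. First, the buffer arguments become contiguous typed memoryviews (uint8_t[:, ::1], object[:, ::1]), and the per-column metadata lookups go through typed memoryviews (lengths, offsets, column_types) instead of Python attribute access inside the loop. Second, the numeric path drops the per-cell np.frombuffer slice assignment in favor of a plain byte-copy loop, which Cython compiles to a C loop with no per-column Python-object overhead. The right-alignment for little-endian data (m = s + 8 - lngt) exists because SAS stores truncated doubles by keeping only their high-order bytes, which sit at the end of an 8-byte little-endian slot. A small pure-Python sketch of that placement logic (place_number is a hypothetical stand-in for the loop body, not part of the commit):

import numpy as np

def place_number(byte_chunk, jb, s, field, byte_order="<"):
    # Copy a lngt-byte numeric field into an 8-byte slot of byte_chunk.
    # Little-endian fields are right-aligned so that the surviving
    # high-order bytes land in the correct positions.
    lngt = len(field)
    m = s + 8 - lngt if byte_order == "<" else s
    for k in range(lngt):              # per-byte copy, as in the Cython loop
        byte_chunk[jb, m + k] = field[k]

byte_chunk = np.zeros((1, 8), dtype=np.uint8)
# 1.5 truncated to its 3 high-order bytes (0x00, 0xF8, 0x3F in
# little-endian storage order); zero-padding the low-order bytes
# restores the original value exactly.
place_number(byte_chunk, 0, 0, b"\x00\xf8\x3f")
print(byte_chunk.view(np.float64))     # [[1.5]] on a little-endian machine

The view call at the end reinterprets the filled 8-byte slot as a float64, confirming that the truncated field was reassembled into a full double.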
