reimplement Vector cython class

jreback · jreback · commit f20c880a030f · 2015-05-28T18:06:15.000-04:00
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -4,6 +4,7 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
 
 from khash cimport *
 from numpy cimport *
+from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
 
 from util cimport _checknan
 cimport util
@@ -33,56 +34,26 @@ cdef extern from "Python.h":
     int PySlice_Check(object)
 
 cdef size_t _INIT_VEC_CAP = 32
-cdef size_t _USE_GIL = 1000000
-
-def list_to_object_array(list obj):
-    '''
-    Convert list to object ndarray. Seriously can't believe I had to write this
-    function
-    '''
-    cdef:
-        Py_ssize_t i, n
-        ndarray[object] arr
-
-    n = len(obj)
-    arr = np.empty(n, dtype=object)
-
-    for i from 0 <= i < n:
-        arr[i] = obj[i]
-
-    return arr
-
 
 cdef class Vector:
-
-    cdef:
-        size_t n, m
-        ndarray ao
-
-    def __len__(self):
-        return self.n
-
-    cdef inline uint8_t needs_resize(self) nogil:
-        # if we need to resize
-        return self.n == self.m
-
-    def to_array(self):
-        self.ao.resize(self.n)
-        self.m = self.n
-        return self.ao
-
+    pass  # empty base class: n/m/ao, __len__ and to_array now live on each concrete vector class
 
 cdef class ObjectVector(Vector):
 
     cdef:
         PyObject **data
+        size_t n, m
+        ndarray ao
 
     def __cinit__(self):
         self.n = 0
         self.m = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
         self.data = <PyObject**> self.ao.data
 
+    def __len__(self):
+        return self.n
+
     cdef inline append(self, object o):
         if self.n == self.m:
             self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -93,64 +64,107 @@ cdef class ObjectVector(Vector):
         self.data[self.n] = <PyObject*> o
         self.n += 1
 
+    def to_array(self):
+        self.ao.resize(self.n)
+        self.m = self.n
+        return self.ao
+
+
+ctypedef struct Int64VectorData:  # plain C state so the nogil helpers below can operate on it
+    int64_t *data  # raw pointer to the element buffer
+    size_t n, m  # n: number of elements stored, m: allocated capacity
+
+cdef uint8_t Int64VectorData_needs_resize(Int64VectorData *data) nogil:
+    return data.n == data.m  # true once the stored count n has reached the capacity m
+
+cdef void Int64VectorData_append(Int64VectorData *data, int64_t x) nogil:
 
-cdef class Int64Vector(Vector):
+    data.data[data.n] = x  # unchecked write: the caller must resize first when n == m
+    data.n += 1
+
+cdef class Int64Vector:  # NOTE(review): no longer derives from Vector, while Float64Vector still does -- confirm intent
 
     cdef:
-        int64_t *data
+        Int64VectorData *data  # bookkeeping struct allocated in __cinit__, freed in __dealloc__
+        ndarray ao  # ndarray whose buffer holds the elements
 
-    def __cinit__(self, int64_t m = -1):
-        self.n = 0
-        self.m = _INIT_VEC_CAP if m == -1 else m
-        self.ao = np.empty(self.m, dtype=np.int64)
-        self.data = <int64_t*> self.ao.data
+    def __cinit__(self):
+        self.data = <Int64VectorData *>PyMem_Malloc(sizeof(Int64VectorData))  # NOTE(review): result is not checked for NULL
+        self.data.n = 0
+        self.data.m = _INIT_VEC_CAP
+        self.ao = np.empty(self.data.m, dtype=np.int64)
+        self.data.data = <int64_t*> self.ao.data
 
     cdef resize(self):
-        self.m = max(self.m * 4, _INIT_VEC_CAP)
-        self.ao.resize(self.m)
-        self.data = <int64_t*> self.ao.data
+        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)  # grow capacity 4x, never below _INIT_VEC_CAP
+        self.ao.resize(self.data.m)
+        self.data.data = <int64_t*> self.ao.data  # re-read the buffer pointer after resizing
 
-    cdef inline void append_nogil(self, int64_t x) nogil:
+    def __dealloc__(self):
+        PyMem_Free(self.data)  # releases only the struct; the elements live in self.ao
 
-        if self.needs_resize():
-            with gil:
-                self.resize()
+    def __len__(self):
+        return self.data.n
 
-        self.data[self.n] = x
-        self.n += 1
+    def to_array(self):
+        self.ao.resize(self.data.n)  # trim the backing array to the stored length
+        self.data.m = self.data.n  # capacity now equals length, so the next append resizes first
+        return self.ao
 
     cdef inline void append(self, int64_t x):
 
-        if self.needs_resize():
+        if Int64VectorData_needs_resize(self.data):
             self.resize()
 
-        self.data[self.n] = x
-        self.n += 1
+        Int64VectorData_append(self.data, x)
+
+ctypedef struct Float64VectorData:  # plain C state so the nogil helpers below can operate on it
+    float64_t *data  # raw pointer to the element buffer
+    size_t n, m  # n: number of elements stored, m: allocated capacity
+
+cdef uint8_t Float64VectorData_needs_resize(Float64VectorData *data) nogil:
+    return data.n == data.m  # true once the stored count n has reached the capacity m
+
+cdef void Float64VectorData_append(Float64VectorData *data, float64_t x) nogil:
+
+    data.data[data.n] = x  # unchecked write: the caller must resize first when n == m
+    data.n += 1
 
 cdef class Float64Vector(Vector):
 
     cdef:
-        float64_t *data
+        Float64VectorData *data  # bookkeeping struct allocated in __cinit__, freed in __dealloc__
+        ndarray ao  # ndarray whose buffer holds the elements
 
     def __cinit__(self):
-        self.n = 0
-        self.m = _INIT_VEC_CAP
-        self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
-        self.data = <float64_t*> self.ao.data
+        self.data = <Float64VectorData *>PyMem_Malloc(sizeof(Float64VectorData))  # NOTE(review): result is not checked for NULL
+        self.data.n = 0
+        self.data.m = _INIT_VEC_CAP
+        self.ao = np.empty(self.data.m, dtype=np.float64)
+        self.data.data = <float64_t*> self.ao.data
 
     cdef resize(self):
-        self.m = max(self.m * 2, _INIT_VEC_CAP)
-        self.ao.resize(self.m)
-        self.data = <float64_t*> self.ao.data
+        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)  # NOTE(review): growth factor changes from 2x to 4x here -- confirm intended
+        self.ao.resize(self.data.m)
+        self.data.data = <float64_t*> self.ao.data  # re-read the buffer pointer after resizing
 
-    cdef inline void append(self, float64_t x) nogil:
-        if self.needs_resize():
-            with gil:
-                self.resize()
+    def __dealloc__(self):
+        PyMem_Free(self.data)  # releases only the struct; the elements live in self.ao
 
-        self.data[self.n] = x
-        self.n += 1
+    def __len__(self):
+        return self.data.n
+
+    def to_array(self):
+        self.ao.resize(self.data.n)  # trim the backing array to the stored length
+        self.data.m = self.data.n  # capacity now equals length, so the next append resizes first
+        return self.ao
 
+    cdef inline void append(self, float64_t x):  # no longer nogil; GIL-free loops call Float64VectorData_append directly
+
+        if Float64VectorData_needs_resize(self.data):
+            self.resize()
+
+        Float64VectorData_append(self.data, x)
 
 cdef class HashTable:
     pass
@@ -370,25 +384,12 @@ cdef class Int64HashTable(HashTable):
             int ret = 0
             int64_t val
             khiter_t k
+            Int64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
-        if n > _USE_GIL:
-            with nogil:
-                for i in range(n):
-                    val = values[i]
-                    k = kh_get_int64(self.table, val)
-                    if k != self.table.n_buckets:
-                        idx = self.table.vals[k]
-                        labels[i] = idx
-                    else:
-                        k = kh_put_int64(self.table, val, &ret)
-                        self.table.vals[k] = count
-                        uniques.append_nogil(val)
-                        labels[i] = count
-                        count += 1
-
-        else:
+        with nogil:
             for i in range(n):
                 val = values[i]
                 k = kh_get_int64(self.table, val)
@@ -398,7 +399,11 @@ cdef class Int64HashTable(HashTable):
                 else:
                     k = kh_put_int64(self.table, val, &ret)
                     self.table.vals[k] = count
-                    uniques.append(val)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
                     labels[i] = count
                     count += 1
 
@@ -414,8 +419,10 @@ cdef class Int64HashTable(HashTable):
             int64_t val
             khiter_t k
             Int64Vector uniques = Int64Vector()
+            Int64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
@@ -433,7 +440,11 @@ cdef class Int64HashTable(HashTable):
                 else:
                     k = kh_put_int64(self.table, val, &ret)
                     self.table.vals[k] = count
-                    uniques.append_nogil(val)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
                     labels[i] = count
                     count += 1
 
@@ -450,14 +461,21 @@ cdef class Int64HashTable(HashTable):
             int64_t val
             khiter_t k
             Int64Vector uniques = Int64Vector()
+            Int64VectorData *ud
+
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
                 val = values[i]
                 k = kh_get_int64(self.table, val)
                 if k == self.table.n_buckets:
                     kh_put_int64(self.table, val, &ret)
-                    uniques.append_nogil(val)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
 
         result = uniques.to_array()
 
@@ -518,8 +536,10 @@ cdef class Float64HashTable(HashTable):
             int ret = 0
             float64_t val
             khiter_t k
+            Float64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
@@ -536,7 +556,11 @@ cdef class Float64HashTable(HashTable):
                 else:
                     k = kh_put_float64(self.table, val, &ret)
                     self.table.vals[k] = count
-                    uniques.append(val)
+
+                    if Float64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Float64VectorData_append(ud, val)
                     labels[i] = count
                     count += 1
 
@@ -581,8 +605,11 @@ cdef class Float64HashTable(HashTable):
             int ret = 0
             float64_t val
             khiter_t k
-            Float64Vector uniques = Float64Vector()
             bint seen_na = 0
+            Float64Vector uniques = Float64Vector()
+            Float64VectorData *ud
+
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
@@ -592,10 +619,19 @@ cdef class Float64HashTable(HashTable):
                     k = kh_get_float64(self.table, val)
                     if k == self.table.n_buckets:
                         kh_put_float64(self.table, val, &ret)
-                        uniques.append(val)
+
+                        if Float64VectorData_needs_resize(ud):
+                            with gil:
+                                uniques.resize()
+                        Float64VectorData_append(ud, val)
+
                 elif not seen_na:
                     seen_na = 1
-                    uniques.append(NAN)
+
+                    if Float64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Float64VectorData_append(ud, NAN)
 
         return uniques.to_array()