pandas-dev
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
+1-1
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
+132-95
diff --git a/pandas/index.pyx b/pandas/index.pyx
+3-3
@@ -131,7 +131,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
 
     table = hash_klass(size_hint or len(vals))
-    uniques = vec_klass(len(vals))
+    uniques = vec_klass()
     labels = table.get_labels(vals, uniques, 0, na_sentinel)
 
     labels = com._ensure_platform_int(labels)
 
@@ -4,6 +4,7 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
 
 from khash cimport *
 from numpy cimport *
+from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
 
 from util cimport _checknan
 cimport util
@@ -33,56 +34,23 @@ cdef extern from "Python.h":
     int PySlice_Check(object)
 
 cdef size_t _INIT_VEC_CAP = 32
-cdef size_t _USE_GIL = 1000000
-
-def list_to_object_array(list obj):
-    '''
-    Convert list to object ndarray. Seriously can't believe I had to write this
-    function
-    '''
-    cdef:
-        Py_ssize_t i, n
-        ndarray[object] arr
-
-    n = len(obj)
-    arr = np.empty(n, dtype=object)
-
-    for i from 0 <= i < n:
-        arr[i] = obj[i]
-
-    return arr
 
-
-cdef class Vector:
+cdef class ObjectVector:
 
     cdef:
+        PyObject **data
         size_t n, m
         ndarray ao
 
-    def __len__(self):
-        return self.n
-
-    cdef inline uint8_t needs_resize(self) nogil:
-        # if we need to resize
-        return self.n == self.m
-
-    def to_array(self):
-        self.ao.resize(self.n)
-        self.m = self.n
-        return self.ao
-
-
-cdef class ObjectVector(Vector):
-
-    cdef:
-        PyObject **data
-
     def __cinit__(self):
         self.n = 0
         self.m = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
         self.data = <PyObject**> self.ao.data
 
+    def __len__(self):
+        return self.n
+
     cdef inline append(self, object o):
         if self.n == self.m:
             self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -93,64 +61,111 @@ cdef class ObjectVector(Vector):
         self.data[self.n] = <PyObject*> o
         self.n += 1
 
+    def to_array(self):
+        self.ao.resize(self.n)
+        self.m = self.n
+        return self.ao
+
+
+ctypedef struct Int64VectorData:
+    int64_t *data
+    size_t n, m
+
+cdef uint8_t Int64VectorData_needs_resize(Int64VectorData *data) nogil:
+    return data.n == data.m
+
+cdef void Int64VectorData_append(Int64VectorData *data, int64_t x) nogil:
+
+    data.data[data.n] = x
+    data.n += 1
 
-cdef class Int64Vector(Vector):
+cdef class Int64Vector:
 
     cdef:
-        int64_t *data
+        Int64VectorData *data
+        ndarray ao
 
-    def __cinit__(self, int64_t m = -1):
-        self.n = 0
-        self.m = _INIT_VEC_CAP if m == -1 else m
-        self.ao = np.empty(self.m, dtype=np.int64)
-        self.data = <int64_t*> self.ao.data
+    def __cinit__(self):
+        self.data = <Int64VectorData *>PyMem_Malloc(sizeof(Int64VectorData))
+        if not self.data:
+            raise MemoryError()
+        self.data.n = 0
+        self.data.m = _INIT_VEC_CAP
+        self.ao = np.empty(self.data.m, dtype=np.int64)
+        self.data.data = <int64_t*> self.ao.data
 
     cdef resize(self):
-        self.m = max(self.m * 4, _INIT_VEC_CAP)
-        self.ao.resize(self.m)
-        self.data = <int64_t*> self.ao.data
+        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+        self.ao.resize(self.data.m)
+        self.data.data = <int64_t*> self.ao.data
 
-    cdef inline void append_nogil(self, int64_t x) nogil:
+    def __dealloc__(self):
+        PyMem_Free(self.data)
 
-        if self.needs_resize():
-            with gil:
-                self.resize()
+    def __len__(self):
+        return self.data.n
 
-        self.data[self.n] = x
-        self.n += 1
+    def to_array(self):
+        self.ao.resize(self.data.n)
+        self.data.m = self.data.n
+        return self.ao
 
     cdef inline void append(self, int64_t x):
 
-        if self.needs_resize():
+        if Int64VectorData_needs_resize(self.data):
             self.resize()
 
-        self.data[self.n] = x
-        self.n += 1
+        Int64VectorData_append(self.data, x)
 
-cdef class Float64Vector(Vector):
+ctypedef struct Float64VectorData:
+    float64_t *data
+    size_t n, m
+
+cdef uint8_t Float64VectorData_needs_resize(Float64VectorData *data) nogil:
+    return data.n == data.m
+
+cdef void Float64VectorData_append(Float64VectorData *data, float64_t x) nogil:
+
+    data.data[data.n] = x
+    data.n += 1
+
+cdef class Float64Vector:
 
     cdef:
-        float64_t *data
+        Float64VectorData *data
+        ndarray ao
 
     def __cinit__(self):
-        self.n = 0
-        self.m = _INIT_VEC_CAP
-        self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
-        self.data = <float64_t*> self.ao.data
+        self.data = <Float64VectorData *>PyMem_Malloc(sizeof(Float64VectorData))
+        if not self.data:
+            raise MemoryError()
+        self.data.n = 0
+        self.data.m = _INIT_VEC_CAP
+        self.ao = np.empty(self.data.m, dtype=np.float64)
+        self.data.data = <float64_t*> self.ao.data
 
     cdef resize(self):
-        self.m = max(self.m * 2, _INIT_VEC_CAP)
-        self.ao.resize(self.m)
-        self.data = <float64_t*> self.ao.data
+        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+        self.ao.resize(self.data.m)
+        self.data.data = <float64_t*> self.ao.data
 
-    cdef inline void append(self, float64_t x) nogil:
-        if self.needs_resize():
-            with gil:
-                self.resize()
+    def __dealloc__(self):
+        PyMem_Free(self.data)
 
-        self.data[self.n] = x
-        self.n += 1
+    def __len__(self):
+        return self.data.n
 
+    def to_array(self):
+        self.ao.resize(self.data.n)
+        self.data.m = self.data.n
+        return self.ao
+
+    cdef inline void append(self, float64_t x):
+
+        if Float64VectorData_needs_resize(self.data):
+            self.resize()
+
+        Float64VectorData_append(self.data, x)
 
 cdef class HashTable:
     pass
@@ -370,25 +385,12 @@ cdef class Int64HashTable(HashTable):
             int ret = 0
             int64_t val
             khiter_t k
+            Int64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
-        if n > _USE_GIL:
-            with nogil:
-                for i in range(n):
-                    val = values[i]
-                    k = kh_get_int64(self.table, val)
-                    if k != self.table.n_buckets:
-                        idx = self.table.vals[k]
-                        labels[i] = idx
-                    else:
-                        k = kh_put_int64(self.table, val, &ret)
-                        self.table.vals[k] = count
-                        uniques.append_nogil(val)
-                        labels[i] = count
-                        count += 1
-
-        else:
+        with nogil:
             for i in range(n):
                 val = values[i]
                 k = kh_get_int64(self.table, val)
@@ -398,7 +400,11 @@ cdef class Int64HashTable(HashTable):
                 else:
                     k = kh_put_int64(self.table, val, &ret)
                     self.table.vals[k] = count
-                    uniques.append(val)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
                     labels[i] = count
                     count += 1
 
@@ -414,8 +420,10 @@ cdef class Int64HashTable(HashTable):
             int64_t val
             khiter_t k
             Int64Vector uniques = Int64Vector()
+            Int64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
@@ -433,7 +441,11 @@ cdef class Int64HashTable(HashTable):
                 else:
                     k = kh_put_int64(self.table, val, &ret)
                     self.table.vals[k] = count
-                    uniques.append_nogil(val)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
                     labels[i] = count
                     count += 1
 
@@ -450,14 +462,21 @@ cdef class Int64HashTable(HashTable):
             int64_t val
             khiter_t k
             Int64Vector uniques = Int64Vector()
+            Int64VectorData *ud
+
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
                 val = values[i]
                 k = kh_get_int64(self.table, val)
                 if k == self.table.n_buckets:
                     kh_put_int64(self.table, val, &ret)
-                    uniques.append_nogil(val)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
 
         result = uniques.to_array()
 
@@ -518,8 +537,10 @@ cdef class Float64HashTable(HashTable):
             int ret = 0
             float64_t val
             khiter_t k
+            Float64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
@@ -536,7 +557,11 @@ cdef class Float64HashTable(HashTable):
                 else:
                     k = kh_put_float64(self.table, val, &ret)
                     self.table.vals[k] = count
-                    uniques.append(val)
+
+                    if Float64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Float64VectorData_append(ud, val)
                     labels[i] = count
                     count += 1
 
@@ -581,8 +606,11 @@ cdef class Float64HashTable(HashTable):
             int ret = 0
             float64_t val
             khiter_t k
-            Float64Vector uniques = Float64Vector()
             bint seen_na = 0
+            Float64Vector uniques = Float64Vector()
+            Float64VectorData *ud
+
+        ud = uniques.data
 
         with nogil:
             for i in range(n):
@@ -592,10 +620,19 @@ cdef class Float64HashTable(HashTable):
                     k = kh_get_float64(self.table, val)
                     if k == self.table.n_buckets:
                         kh_put_float64(self.table, val, &ret)
-                        uniques.append(val)
+
+                        if Float64VectorData_needs_resize(ud):
+                            with gil:
+                                uniques.resize()
+                        Float64VectorData_append(ud, val)
+
                 elif not seen_na:
                     seen_na = 1
-                    uniques.append(NAN)
+
+                    if Float64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Float64VectorData_append(ud, NAN)
 
         return uniques.to_array()
 
 
@@ -233,14 +233,13 @@ cdef class IndexEngine:
     cdef inline _do_monotonic_check(self):
         try:
             values = self._get_index_values()
-            self.monotonic_inc, self.monotonic_dec, self.unique = \
+            self.monotonic_inc, self.monotonic_dec = \
                 self._call_monotonic(values)
         except TypeError:
             self.monotonic_inc = 0
             self.monotonic_dec = 0
 
         self.monotonic_check = 1
-        self.unique_check = 1
 
     cdef _get_index_values(self):
         return self.vgetter()
@@ -269,7 +268,8 @@ cdef class IndexEngine:
 
         if len(self.mapping) == len(values):
             self.unique = 1
-        self.unique_check = 1
+            self.unique_check = 1
+
         self.initialized = 1
 
     def clear_mapping(self):