Add a `refcheck=True` keyword argument to functions that call ndarray.resize; pass refcheck=False where it is safe to do so

mattip · mattip · commit 56233e45544f · 2017-03-18T23:07:41.000+02:00
diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
@@ -53,7 +53,7 @@ cdef class Int64Vector:
     cdef Int64VectorData *data
     cdef ndarray ao
 
-    cdef resize(self)
-    cpdef to_array(self)
-    cdef inline void append(self, int64_t x)
+    cdef resize(self, refcheck=*)
+    cpdef to_array(self, refcheck=*)
+    cdef inline void append(self, int64_t x, refcheck=*)
     cdef extend(self, int64_t[:] x)
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -85,9 +85,9 @@ cdef class {{name}}Vector:
         self.ao = np.empty(self.data.m, dtype={{idtype}})
         self.data.data = <{{arg}}*> self.ao.data
 
-    cdef resize(self):
+    cdef resize(self, refcheck=True):
         self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
-        self.ao.resize(self.data.m)
+        self.ao.resize(self.data.m, refcheck=refcheck)
         self.data.data = <{{arg}}*> self.ao.data
 
     def __dealloc__(self):
@@ -98,15 +98,15 @@ cdef class {{name}}Vector:
     def __len__(self):
         return self.data.n
 
-    cpdef to_array(self):
-        self.ao.resize(self.data.n)
+    cpdef to_array(self, refcheck=True):
+        self.ao.resize(self.data.n, refcheck=refcheck)
         self.data.m = self.data.n
         return self.ao
 
-    cdef inline void append(self, {{arg}} x):
+    cdef inline void append(self, {{arg}} x, refcheck=True):
 
         if needs_resize(self.data):
-            self.resize()
+            self.resize(refcheck=refcheck)
 
         append_data_{{dtype}}(self.data, x)
 
@@ -130,11 +130,12 @@ cdef class StringVector:
         self.data.m = _INIT_VEC_CAP
         self.data.data = <char **> malloc(self.data.m * sizeof(char *))
 
-    cdef resize(self):
+    cdef resize(self, refcheck=True):
         cdef:
             char **orig_data
             size_t i, m
 
+        # refcheck is ignored here; it is accepted only for API compatibility
         m = self.data.m
         self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
 
@@ -154,23 +155,24 @@ cdef class StringVector:
     def __len__(self):
         return self.data.n
 
-    def to_array(self):
+    def to_array(self, refcheck=True):
         cdef:
             ndarray ao
             size_t n
             object val
-
+        
+        # refcheck is unused but needed for API compatibility
         ao = np.empty(self.data.n, dtype=np.object)
         for i in range(self.data.n):
             val = self.data.data[i]
             ao[i] = val
         self.data.m = self.data.n
         return ao
 
-    cdef inline void append(self, char * x):
+    cdef inline void append(self, char * x, refcheck=True):
 
         if needs_resize(self.data):
-            self.resize()
+            self.resize(refcheck=refcheck)
 
         append_data_string(self.data, x)
 
@@ -191,18 +193,18 @@ cdef class ObjectVector:
     def __len__(self):
         return self.n
 
-    cdef inline append(self, object o):
+    cdef inline append(self, object o, refcheck=True):
         if self.n == self.m:
             self.m = max(self.m * 2, _INIT_VEC_CAP)
-            self.ao.resize(self.m)
+            self.ao.resize(self.m, refcheck=refcheck)
             self.data = <PyObject**> self.ao.data
 
         Py_INCREF(o)
         self.data[self.n] = <PyObject*> o
         self.n += 1
 
-    def to_array(self):
-        self.ao.resize(self.n)
+    def to_array(self, refcheck=True):
+        self.ao.resize(self.n, refcheck=refcheck)
         self.m = self.n
         return self.ao
 
@@ -324,13 +326,13 @@ cdef class {{name}}HashTable(HashTable):
 
     def factorize(self, {{dtype}}_t values):
         uniques = {{name}}Vector()
-        labels = self.get_labels(values, uniques, 0, 0)
-        return uniques.to_array(), labels
+        labels = self.get_labels(values, uniques, 0, 0, refcheck=False)
+        return uniques.to_array(refcheck=False), labels
 
     @cython.boundscheck(False)
     def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                   bint check_null=True):
+                   bint check_null=True, bint refcheck=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -362,7 +364,7 @@ cdef class {{name}}HashTable(HashTable):
 
                     if needs_resize(ud):
                         with gil:
-                            uniques.resize()
+                            uniques.resize(refcheck=refcheck)
                     append_data_{{dtype}}(ud, val)
                     labels[i] = count
                     count += 1
@@ -405,12 +407,12 @@ cdef class {{name}}HashTable(HashTable):
 
                     if needs_resize(ud):
                         with gil:
-                            uniques.resize()
+                            uniques.resize(refcheck=False)
                     append_data_{{dtype}}(ud, val)
                     labels[i] = count
                     count += 1
 
-        arr_uniques = uniques.to_array()
+        arr_uniques = uniques.to_array(refcheck=False)
 
         return np.asarray(labels), arr_uniques
 
@@ -438,25 +440,25 @@ cdef class {{name}}HashTable(HashTable):
                         kh_put_{{dtype}}(self.table, val, &ret)
                         if needs_resize(ud):
                             with gil:
-                                uniques.resize()
+                                uniques.resize(refcheck=False)
                         append_data_{{dtype}}(ud, val)
                 elif not seen_na:
                     seen_na = 1
                     if needs_resize(ud):
                         with gil:
-                            uniques.resize()
+                            uniques.resize(refcheck=False)
                     append_data_{{dtype}}(ud, NAN)
                 {{else}}
                 k = kh_get_{{dtype}}(self.table, val)
                 if k == self.table.n_buckets:
                     kh_put_{{dtype}}(self.table, val, &ret)
                     if needs_resize(ud):
                         with gil:
-                            uniques.resize()
+                            uniques.resize(refcheck=False)
                     append_data_{{dtype}}(ud, val)
                 {{endif}}
 
-        return uniques.to_array()
+        return uniques.to_array(refcheck=False)
 
 {{endfor}}
 
@@ -571,12 +573,12 @@ cdef class StringHashTable(HashTable):
         uniques = ObjectVector()
         for i in range(count):
             uniques.append(values[uindexer[i]])
-        return uniques.to_array()
+        return uniques.to_array(refcheck=False)
 
     def factorize(self, ndarray[object] values):
         uniques = ObjectVector()
-        labels = self.get_labels(values, uniques, 0, 0)
-        return uniques.to_array(), labels
+        labels = self.get_labels(values, uniques, 0, 0, refcheck=0)
+        return uniques.to_array(refcheck=False), labels
 
     @cython.boundscheck(False)
     def lookup(self, ndarray[object] values):
@@ -642,7 +644,7 @@ cdef class StringHashTable(HashTable):
     @cython.boundscheck(False)
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=1):
+                   bint check_null=1, bint refcheck=1):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -654,6 +656,7 @@ cdef class StringHashTable(HashTable):
             char **vecs
             khiter_t k
 
+        # refcheck is accepted for API compatibility with other get_labels
         # these by-definition *must* be strings
         labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
@@ -811,11 +814,11 @@ cdef class PyObjectHashTable(HashTable):
                 seen_na = 1
                 uniques.append(nan)
 
-        return uniques.to_array()
+        return uniques.to_array(refcheck=False)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
+                   bint check_null=True, bint refcheck=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -968,5 +971,5 @@ cdef class MultiIndexHashTable(HashTable):
 
     def get_labels(self, object mi, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
+                   bint check_null=True, bint refcheck=True):
         raise NotImplementedError
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -349,11 +349,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     table = hash_klass(size_hint or len(vals))
     uniques = vec_klass()
     check_nulls = not is_integer_dtype(values)
-    labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)
+    labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls, refcheck=False)
 
     labels = _ensure_platform_int(labels)
 
-    uniques = uniques.to_array()
+    uniques = uniques.to_array(refcheck=False)
 
     if sort and len(uniques) > 0:
         uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -289,7 +289,6 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
                                     "explicitly specify the categories order "
                                     "by passing in a categories argument.")
             except ValueError:
-
                 # FIXME
                 raise NotImplementedError("> 1 ndim Categorical are not "
                                           "supported at this time")
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -1410,7 +1410,7 @@ def _factorize_keys(lk, rk, sort=True):
     count = rizer.get_count()
 
     if sort:
-        uniques = rizer.uniques.to_array()
+        uniques = rizer.uniques.to_array(refcheck=False)
         llab, rlab = _sort_labels(uniques, llab, rlab)
 
     # NA group