diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..5b4e6e99e5e93 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -52,8 +52,9 @@ cdef struct Int64VectorData: cdef class Int64Vector: cdef Int64VectorData *data cdef ndarray ao + cdef bint external_view_exists - cdef resize(self) + cdef void resize(self) nogil cpdef to_array(self) cdef inline void append(self, int64_t x) cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c8aedcef77502..d7e7c017f04e9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -5,12 +5,12 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check from khash cimport * from numpy cimport * -from libc.stdlib cimport malloc, free +from libc.stdlib cimport malloc, free, realloc from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free, PyString_Check, PyBytes_Check, PyUnicode_Check) -from util cimport _checknan +from util cimport _checknan, set_array_owndata cimport util import numpy as np diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..cfacfc3be6e49 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -58,10 +58,10 @@ cdef inline bint needs_resize(vector_data *data) nogil: {{py: -# name, dtype, arg, idtype -dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), - ('UInt64', 'uint64', 'uint64_t', 'np.uint64'), - ('Int64', 'int64', 'int64_t', 'np.int64')] +# name, dtype, arg, type_id +dtypes = [('Float64', 'float64', 'float64_t', 'cnp.NPY_FLOAT64'), + ('UInt64', 'uint64', 'uint64_t', 'cnp.NPY_UINT64'), + ('Int64', 'int64', 'int64_t', 'cnp.NPY_INT64')] }} @@ -71,6 +71,7 @@ cdef class {{name}}Vector: {{if dtype != 'int64'}} cdef: + bint external_view_exists {{name}}VectorData *data ndarray ao {{endif}} @@ -80,18 +81,21 @@ cdef class {{name}}Vector: sizeof({{name}}VectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype={{idtype}}) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*> malloc(self.data.m * sizeof({{arg}})) - cdef resize(self): + cdef void resize(self) nogil: + # TODO: handle failure to allocate self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*> realloc(self.data.data, self.data.m * sizeof({{arg}})) def __dealloc__(self): if self.data is not NULL: + if not self.external_view_exists and self.data.data: + # buffer was never returned as array, so free + free(self.data.data) PyMem_Free(self.data) self.data = NULL @@ -99,9 +103,19 @@ cdef class {{name}}Vector: return self.data.n cpdef to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao + cdef: + ndarray ao + cnp.npy_intp shape[1] + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") + + self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}})) + self.external_view_exists = True + shape[0] = self.data.n + ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, self.data.data) + # ownership transfer so numpy eventually frees + set_array_owndata(ao) + return ao cdef inline void append(self, {{arg}} x): @@ -120,32 +134,28 @@ cdef class StringVector: cdef: StringVectorData *data + bint external_view_exists def __cinit__(self): self.data = PyMem_Malloc( sizeof(StringVectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) - cdef resize(self): - cdef: - char **orig_data - size_t i, m - - m = self.data.m + cdef void resize(self) nogil: self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - # TODO: can resize? - orig_data = self.data.data - self.data.data = malloc(self.data.m * sizeof(char *)) - for i in range(m): - self.data.data[i] = orig_data[i] + self.data.data = realloc(self.data.data, + self.data.m * sizeof(char *)) def __dealloc__(self): if self.data is not NULL: + # unlike numeric vectors, strings were copied + # into python objects so always free if self.data.data is not NULL: free(self.data.data) PyMem_Free(self.data) @@ -159,12 +169,15 @@ cdef class StringVector: ndarray ao size_t n object val + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") ao = np.empty(self.data.n, dtype=np.object) for i in range(self.data.n): val = self.data.data[i] ao[i] = val self.data.m = self.data.n + self.external_view_exists = True return ao cdef inline void append(self, char * x): @@ -181,8 +194,10 @@ cdef class ObjectVector: PyObject **data size_t n, m ndarray ao + bint external_view_exists def __cinit__(self): + self.external_view_exists = False self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) @@ -194,7 +209,7 @@ cdef class ObjectVector: cdef inline append(self, object o): if self.n == self.m: self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) + self.ao.resize(self.m, refcheck=False) self.data = self.ao.data Py_INCREF(o) @@ -202,8 +217,11 @@ cdef class ObjectVector: self.n += 1 def to_array(self): + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") self.ao.resize(self.n) self.m = self.n + self.external_view_exists = True return self.ao @@ -361,8 +379,7 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = count if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -404,8 +421,7 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = count if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -437,22 +453,19 @@ cdef class {{name}}HashTable(HashTable): if k == self.table.n_buckets: kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) elif not seen_na: seen_na = 1 if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, NAN) {{else}} k = kh_get_{{dtype}}(self.table, val) if k == self.table.n_buckets: kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) {{endif}}