diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 29ace4a339ced..a5a3edad63403 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -183,7 +183,7 @@ cdef class Int64Vector(Vector): cdef Int64VectorData data cdef ndarray ao - cdef resize(self) + cdef resize(self, Py_ssize_t new_size) cpdef ndarray to_array(self) cdef void append(self, int64_t x) noexcept cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 8250d0242c31f..070533ba999c7 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -7,6 +7,7 @@ from libc.stdlib cimport ( free, malloc, ) +from libc.string cimport memcpy import numpy as np diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f37a32ed61555..f9abd574dae01 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -163,8 +163,9 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(vector_data *data) noexcept nogil: - return data.size == data.capacity + +cdef bint needs_resize(Py_ssize_t nelems, Py_ssize_t capacity) noexcept nogil: + return nelems >= capacity # ---------------------------------------------------------------------- # Vector @@ -214,8 +215,8 @@ cdef class {{name}}Vector(Vector): self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data - cdef resize(self): - self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) + cdef resize(self, Py_ssize_t new_size): + self.data.capacity = max(new_size, _INIT_VEC_CAP) self.ao.resize(self.data.capacity, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data @@ -234,17 +235,28 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(&self.data): + if needs_resize(self.data.size, self.data.capacity): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.resize() + self.resize(self.data.capacity * 4) append_data_{{dtype}}(&self.data, x) cdef extend(self, const {{c_type}}[:] x): - for i in range(len(x)): - self.append(x[i]) + cdef Py_ssize_t x_size = len(x) + if x_size == 0: + return + + cdef Py_ssize_t needed_size = self.data.size + x_size + if needs_resize(needed_size, self.data.capacity): + if self.external_view_exists: + raise ValueError("external reference but " + "Vector.resize() needed") + self.resize(needed_size) + + memcpy(self.data.data + self.data.size, &x[0], x_size * sizeof({{c_type}})) + self.data.size = needed_size {{endfor}} @@ -260,7 +272,7 @@ cdef class StringVector(Vector): if self.data.data is NULL: raise MemoryError() - cdef resize(self): + cdef resize(self, Py_ssize_t new_size): cdef: char **orig_data Py_ssize_t i, orig_capacity @@ -297,8 +309,8 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(&self.data): - self.resize() + if needs_resize(self.data.size, self.data.capacity): + self.resize(self.data.capacity * 4) append_data_string(&self.data, x) @@ -684,18 +696,18 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize() + uniques.resize(uniques.data.capacity * 4) if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize() + result_mask.resize(result_mask.data.capacity * 4) append_data_{{dtype}}(ud, val) append_data_uint8(rmd, 1) continue @@ -706,19 +718,19 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize() + uniques.resize(uniques.data.capacity * 4) if use_result_mask: if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize() + result_mask.resize(result_mask.data.capacity * 4) append_data_{{dtype}}(ud, val) if use_result_mask: append_data_uint8(rmd, 0) @@ -849,9 +861,9 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: - uniques.resize() + uniques.resize(uniques.data.capacity * 4) append_data_{{dtype}}(ud, val) labels[i] = count count += 1 diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index ca1b28b9442ca..5500fadb73b6d 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -480,9 +480,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud): + if needs_resize(ud.size, ud.capacity): with gil: - idx.resize() + idx.resize(idx.data.capacity * 4) append_data_{{ttype}}(ud, i) kh_destroy_{{ttype}}(table)