From fe1cfc82d6ef6e8d9aa23264c2e02fd2182150bc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 14:43:07 -0800 Subject: [PATCH 01/15] Hashtable cleanups --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 46 +++++++--------------- pandas/_libs/hashtable_func_helper.pxi.in | 6 +-- 3 files changed, 19 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index eaec9e8462450..22b923580c491 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -180,7 +180,7 @@ cdef class Vector: cdef bint external_view_exists cdef class Int64Vector(Vector): - cdef Int64VectorData *data + cdef Int64VectorData data cdef ndarray ao cdef resize(self) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 26dcf0b6c4ce3..f1a0e61a57056 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -163,8 +163,8 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(vector_data *data) noexcept nogil: - return data.n == data.m +cdef bint needs_resize(size_t size, size_t capacity) noexcept nogil: + return size == capacity # ---------------------------------------------------------------------- # Vector @@ -204,15 +204,11 @@ cdef class {{name}}Vector(Vector): # Int64Vector is the only one we need exposed for other cython files. {{if dtype != 'int64'}} cdef: - {{name}}VectorData *data + {{name}}VectorData data ndarray ao {{endif}} def __cinit__(self): - self.data = <{{name}}VectorData *>PyMem_Malloc( - sizeof({{name}}VectorData)) - if not self.data: - raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) @@ -223,11 +219,6 @@ cdef class {{name}}Vector(Vector): self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data - def __dealloc__(self): - if self.data is not NULL: - PyMem_Free(self.data) - self.data = NULL - def __len__(self) -> int: return self.data.n @@ -243,13 +234,13 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data): + if needs_resize(self.data.n, self.data.m): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.resize() - append_data_{{dtype}}(self.data, x) + append_data_{{dtype}}(&self.data, x) cdef extend(self, const {{c_type}}[:] x): for i in range(len(x)): @@ -260,12 +251,9 @@ cdef class {{name}}Vector(Vector): cdef class StringVector(Vector): cdef: - StringVectorData *data + StringVectorData data def __cinit__(self): - self.data = PyMem_Malloc(sizeof(StringVectorData)) - if not self.data: - raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) @@ -288,11 +276,7 @@ cdef class StringVector(Vector): self.data.data[i] = orig_data[i] def __dealloc__(self): - if self.data is not NULL: - if self.data.data is not NULL: - free(self.data.data) - PyMem_Free(self.data) - self.data = NULL + free(self.data.data) def __len__(self) -> int: return self.data.n @@ -313,10 +297,10 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(self.data): + if needs_resize(self.data.n, self.data.m): self.resize() - append_data_string(self.data, x) + append_data_string(&self.data, x) cdef extend(self, ndarray[object] x): for i in range(len(x)): @@ -652,7 +636,7 @@ cdef class 
{{name}}HashTable(HashTable): if return_inverse: labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -662,7 +646,7 @@ cdef class {{name}}HashTable(HashTable): raise NotImplementedError # pragma: no cover result_mask = UInt8Vector() - rmd = result_mask.data + rmd = &result_mask.data if use_mask: mask_values = mask.view("uint8") @@ -700,7 +684,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -722,7 +706,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -846,7 +830,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data with nogil: for i in range(n): @@ -865,7 +849,7 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: uniques.resize() append_data_{{dtype}}(ud, val) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 336af306d410f..7bd252a0f43cd 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{name}}Vector idx = {{name}}Vector() ndarray[{{c_type}}, ndim=1] arr - {{name}}VectorData *ud = idx.data + {{name}}VectorData ud = idx.data kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) @@ -480,10 +480,10 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: idx.resize() - append_data_{{ttype}}(ud, i) + append_data_{{ttype}}(&ud, i) kh_destroy_{{ttype}}(table) From 2d8ad8e8928a78888c52261b7b8536212b692deb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 14:49:08 -0800 Subject: [PATCH 02/15] Remove unused imports --- pandas/_libs/hashtable.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..8250d0242c31f 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,8 +1,4 @@ cimport cython -from cpython.mem cimport ( - PyMem_Free, - PyMem_Malloc, -) from cpython.ref cimport ( Py_INCREF, PyObject, From d2ff43763b435f3c7663709e4118efc1e344851d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 15:14:28 -0800 Subject: [PATCH 03/15] renamed .n -> size, .m -> capacity --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 82 +++++++++++----------- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/intervaltree.pxi.in | 10 +-- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 22b923580c491..29ace4a339ced 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -174,7 +174,7 @@ cdef class StringHashTable(HashTable): cdef struct 
Int64VectorData: int64_t *data - Py_ssize_t n, m + Py_ssize_t size, capacity cdef class Vector: cdef bint external_view_exists diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f1a0e61a57056..ea41c6ee11c8a 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -133,7 +133,7 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), ctypedef struct {{name}}VectorData: {{c_type}} *data - Py_ssize_t n, m + Py_ssize_t size, capacity {{endif}} @@ -143,8 +143,8 @@ ctypedef struct {{name}}VectorData: cdef void append_data_{{dtype}}({{name}}VectorData *data, {{c_type}} x) noexcept nogil: - data.data[data.n] = x - data.n += 1 + data.data[data.size] = x + data.size += 1 {{endfor}} @@ -209,32 +209,32 @@ cdef class {{name}}Vector(Vector): {{endif}} def __cinit__(self): - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.size = 0 + self.data.capacity = _INIT_VEC_CAP + self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m, refcheck=False) + self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.capacity, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data def __len__(self) -> int: - return self.data.n + return self.data.size cpdef ndarray to_array(self): - if self.data.m != self.data.n: + if self.data.capacity != self.data.size: if self.external_view_exists: # should never happen raise ValueError("should have raised on append()") - self.ao.resize(self.data.n, refcheck=False) - self.data.m = self.data.n + self.ao.resize(self.data.size, refcheck=False) + self.data.capacity = self.data.size self.external_view_exists = True return self.ao cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data.n, self.data.m): + if needs_resize(self.data.size, self.data.capacity): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") @@ -254,32 +254,32 @@ cdef class StringVector(Vector): StringVectorData data def __cinit__(self): - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.data.data = malloc(self.data.m * sizeof(char *)) + self.data.size = 0 + self.data.capacity = _INIT_VEC_CAP + self.data.data = malloc(self.data.capacity * sizeof(char *)) if not self.data.data: raise MemoryError() cdef resize(self): cdef: char **orig_data - Py_ssize_t i, m + Py_ssize_t i, capacity - m = self.data.m - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + capacity = self.data.capacity + self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) orig_data = self.data.data - self.data.data = malloc(self.data.m * sizeof(char *)) + self.data.data = malloc(self.data.capacity * sizeof(char *)) if not self.data.data: raise MemoryError() - for i in range(m): + for i in range(capacity): self.data.data[i] = orig_data[i] def __dealloc__(self): free(self.data.data) def __len__(self) -> int: - return self.data.n + return self.data.size cpdef ndarray[object, ndim=1] to_array(self): cdef: @@ -287,17 +287,17 @@ cdef class StringVector(Vector): Py_ssize_t n object val - ao = np.empty(self.data.n, dtype=object) - for i in range(self.data.n): + ao = np.empty(self.data.size, dtype=object) + for i in range(self.data.size): val = self.data.data[i] ao[i] = val self.external_view_exists = True - self.data.m = self.data.n + 
self.data.capacity = self.data.size return ao cdef void append(self, char *x) noexcept: - if needs_resize(self.data.n, self.data.m): + if needs_resize(self.data.size, self.data.capacity): self.resize() append_data_string(&self.data, x) @@ -311,37 +311,37 @@ cdef class ObjectVector(Vector): cdef: PyObject **data - Py_ssize_t n, m + Py_ssize_t size, capacity ndarray ao def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP + self.size = 0 + self.capacity = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) self.data = self.ao.data def __len__(self) -> int: - return self.n + return self.size cdef append(self, object obj): - if self.n == self.m: + if self.size == self.capacity: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m, refcheck=False) + self.size = max(self.capacity * 2, _INIT_VEC_CAP) + self.ao.resize(self.capacity, refcheck=False) self.data = self.ao.data Py_INCREF(obj) - self.data[self.n] = obj - self.n += 1 + self.data[self.size] = obj + self.size += 1 cpdef ndarray[object, ndim=1] to_array(self): - if self.m != self.n: + if self.capacity != self.size: if self.external_view_exists: raise ValueError("should have raised on append()") - self.ao.resize(self.n, refcheck=False) - self.m = self.n + self.ao.resize(self.size, refcheck=False) + self.capacity = self.size self.external_view_exists = True return self.ao @@ -684,7 +684,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -706,7 +706,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -849,7 +849,7 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: uniques.resize() append_data_{{dtype}}(ud, val) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 7bd252a0f43cd..9a9cc5a08473c 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -480,7 +480,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: idx.resize() append_data_{{ttype}}(&ud, i) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index a6cec0fb30ecc..b94f60c272e5d 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -145,12 +145,12 @@ cdef class IntervalTree(IntervalMixin): # overflow -> no match, which is already handled below pass - if result.data.n == old_len: + if result.data.size == old_len: result.append(-1) - elif result.data.n > old_len + 1: + elif result.data.size > old_len + 1: raise KeyError( 'indexer does not intersect a unique set of intervals') - old_len = result.data.n + old_len = result.data.size return result.to_array().astype('intp') def get_indexer_non_unique(self, ndarray[scalar_t, ndim=1] target): @@ -172,10 +172,10 @@ cdef class 
IntervalTree(IntervalMixin): # overflow -> no match, which is already handled below pass - if result.data.n == old_len: + if result.data.size == old_len: result.append(-1) missing.append(i) - old_len = result.data.n + old_len = result.data.size return (result.to_array().astype('intp'), missing.to_array().astype('intp')) From 9703000446df7a60b6b62a7e753e0fdc42cc22b7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 15:29:30 -0800 Subject: [PATCH 04/15] size_t -> Py_ssize_t --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ea41c6ee11c8a..4891582cf096b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -163,7 +163,7 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(size_t size, size_t capacity) noexcept nogil: +cdef bint needs_resize(Py_ssize_t size, Py_ssize_t capacity) noexcept nogil: return size == capacity # ---------------------------------------------------------------------- From 0ed0c61c7da12351a83ad09bb9f0a42d627f1a4b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 15:49:19 -0800 Subject: [PATCH 05/15] revert needs_resize --- pandas/_libs/hashtable_class_helper.pxi.in | 14 +++++++------- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 4891582cf096b..80087e3b761d6 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -163,8 +163,8 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(Py_ssize_t size, Py_ssize_t capacity) noexcept nogil: - return size == capacity +cdef bint needs_resize(const vector_data *data) noexcept nogil: + return data.size == data.capacity # ---------------------------------------------------------------------- # Vector @@ -234,7 +234,7 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data.size, self.data.capacity): + if needs_resize(&self.data): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") @@ -297,7 +297,7 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(self.data.size, self.data.capacity): + if needs_resize(&self.data): self.resize() append_data_string(&self.data, x) @@ -684,7 +684,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud.size, ud.capacity): + if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -706,7 +706,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud.size, ud.capacity): + if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -849,7 +849,7 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud.size, ud.capacity): + if needs_resize(ud): with gil: uniques.resize() append_data_{{dtype}}(ud, val) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 9a9cc5a08473c..1cd740c1dd1f8 100644 --- 
a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -480,7 +480,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud.size, ud.capacity): + if needs_resize(&ud): with gil: idx.resize() append_data_{{ttype}}(&ud, i) From 0704699bc9f4b4f6b408c2b3e02fd8cde5af8bb4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 15:53:11 -0800 Subject: [PATCH 06/15] remove unnecessary pointers --- pandas/_libs/hashtable_class_helper.pxi.in | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 80087e3b761d6..406833a7f66a0 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -628,15 +628,15 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val, na_value2 khiter_t k - {{name}}VectorData *ud + {{name}}VectorData ud UInt8Vector result_mask - UInt8VectorData *rmd + UInt8VectorData rmd bint use_na_value, use_mask, seen_na = False const uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.intp) - ud = &uniques.data + ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -646,7 +646,7 @@ cdef class {{name}}HashTable(HashTable): raise NotImplementedError # pragma: no cover result_mask = UInt8Vector() - rmd = &result_mask.data + rmd = result_mask.data if use_mask: mask_values = mask.view("uint8") @@ -684,7 +684,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud): + if needs_resize(&ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -696,8 +696,8 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(ud, val) - append_data_uint8(rmd, 1) + append_data_{{dtype}}(&ud, val) + append_data_uint8(&rmd, 1) continue k = kh_get_{{dtype}}(self.table, val) @@ -706,7 +706,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): + if needs_resize(&ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -721,7 +721,7 @@ cdef class {{name}}HashTable(HashTable): result_mask.resize() append_data_{{dtype}}(ud, val) if use_result_mask: - append_data_uint8(rmd, 0) + append_data_uint8(&rmd, 0) if return_inverse: self.table.vals[k] = count From bf1864d9405950f7190188b4d18cb8730be69b61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 16:01:19 -0800 Subject: [PATCH 07/15] fix build issues --- pandas/_libs/hashtable_class_helper.pxi.in | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 406833a7f66a0..de2e9bdc55203 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -719,7 +719,7 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(ud, val) + append_data_{{dtype}}(&ud, val) if use_result_mask: append_data_uint8(&rmd, 0) @@ -827,10 +827,10 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k {{name}}Vector uniques = 
{{name}}Vector() - {{name}}VectorData *ud + {{name}}VectorData ud labels = np.empty(n, dtype=np.intp) - ud = &uniques.data + ud = uniques.data with nogil: for i in range(n): @@ -849,10 +849,10 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud): + if needs_resize(&ud): with gil: uniques.resize() - append_data_{{dtype}}(ud, val) + append_data_{{dtype}}(&ud, val) labels[i] = count count += 1 From b114654c983fa94cb28933d72db3ffc73346635d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 20:20:12 -0800 Subject: [PATCH 08/15] Removed ud variable --- pandas/_libs/hashtable_class_helper.pxi.in | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index de2e9bdc55203..1f0dc02ef130b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -628,7 +628,6 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val, na_value2 khiter_t k - {{name}}VectorData ud UInt8Vector result_mask UInt8VectorData rmd bint use_na_value, use_mask, seen_na = False @@ -636,7 +635,6 @@ cdef class {{name}}HashTable(HashTable): if return_inverse: labels = np.empty(n, dtype=np.intp) - ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -684,7 +682,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(&ud): + if needs_resize(&uniques.data): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -696,7 +694,7 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(&ud, val) + append_data_{{dtype}}(&uniques.data, val) append_data_uint8(&rmd, 1) continue @@ -706,7 +704,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(&ud): + if needs_resize(&uniques.data): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -719,7 +717,7 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(&ud, val) + append_data_{{dtype}}(&uniques.data, val) if use_result_mask: append_data_uint8(&rmd, 0) From 3590a6b3d7beee8c2a3d7c35476f05267382e75a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 08:40:26 -0800 Subject: [PATCH 09/15] Fix ObjectVector issue --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1f0dc02ef130b..6fa4cb5c4a3ea 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -328,7 +328,7 @@ cdef class ObjectVector(Vector): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.size = max(self.capacity * 2, _INIT_VEC_CAP) + self.capacity = max(self.capacity * 2, _INIT_VEC_CAP) self.ao.resize(self.capacity, refcheck=False) self.data = self.ao.data From d4f24c2d3605c8a8e307bdc86aed5250e069e355 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 09:18:22 -0800 Subject: [PATCH 10/15] try setting NULL in dealloc --- pandas/_libs/hashtable_class_helper.pxi.in | 1 + 1 file changed, 1 
insertion(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6fa4cb5c4a3ea..cb7b240e23c3a 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -277,6 +277,7 @@ cdef class StringVector(Vector): def __dealloc__(self): free(self.data.data) + self.data.data = NULL def __len__(self) -> int: return self.data.size From 54ebc26eca6447edd5d01661a67ee354fd3b30c6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 09:53:37 -0800 Subject: [PATCH 11/15] reset things --- pandas/_libs/hashtable.pxd | 4 +- pandas/_libs/hashtable.pyx | 4 + pandas/_libs/hashtable_class_helper.pxi.in | 129 ++++++++++++--------- pandas/_libs/hashtable_func_helper.pxi.in | 6 +- pandas/_libs/intervaltree.pxi.in | 10 +- 5 files changed, 87 insertions(+), 66 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 29ace4a339ced..eaec9e8462450 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -174,13 +174,13 @@ cdef class StringHashTable(HashTable): cdef struct Int64VectorData: int64_t *data - Py_ssize_t size, capacity + Py_ssize_t n, m cdef class Vector: cdef bint external_view_exists cdef class Int64Vector(Vector): - cdef Int64VectorData data + cdef Int64VectorData *data cdef ndarray ao cdef resize(self) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 8250d0242c31f..ccac3d0b50d45 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,4 +1,8 @@ cimport cython +from cpython.mem cimport ( + PyMem_Free, + PyMem_Malloc, +) from cpython.ref cimport ( Py_INCREF, PyObject, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index cb7b240e23c3a..26dcf0b6c4ce3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -133,7 +133,7 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), ctypedef struct {{name}}VectorData: {{c_type}} *data - Py_ssize_t size, capacity + Py_ssize_t n, m {{endif}} @@ -143,8 +143,8 @@ ctypedef struct {{name}}VectorData: cdef void append_data_{{dtype}}({{name}}VectorData *data, {{c_type}} x) noexcept nogil: - data.data[data.size] = x - data.size += 1 + data.data[data.n] = x + data.n += 1 {{endfor}} @@ -163,8 +163,8 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(const vector_data *data) noexcept nogil: - return data.size == data.capacity +cdef bint needs_resize(vector_data *data) noexcept nogil: + return data.n == data.m # ---------------------------------------------------------------------- # Vector @@ -204,43 +204,52 @@ cdef class {{name}}Vector(Vector): # Int64Vector is the only one we need exposed for other cython files. 
{{if dtype != 'int64'}} cdef: - {{name}}VectorData data + {{name}}VectorData *data ndarray ao {{endif}} def __cinit__(self): - self.data.size = 0 - self.data.capacity = _INIT_VEC_CAP - self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}}) + self.data = <{{name}}VectorData *>PyMem_Malloc( + sizeof({{name}}VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data cdef resize(self): - self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.capacity, refcheck=False) + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data + def __dealloc__(self): + if self.data is not NULL: + PyMem_Free(self.data) + self.data = NULL + def __len__(self) -> int: - return self.data.size + return self.data.n cpdef ndarray to_array(self): - if self.data.capacity != self.data.size: + if self.data.m != self.data.n: if self.external_view_exists: # should never happen raise ValueError("should have raised on append()") - self.ao.resize(self.data.size, refcheck=False) - self.data.capacity = self.data.size + self.ao.resize(self.data.n, refcheck=False) + self.data.m = self.data.n self.external_view_exists = True return self.ao cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(&self.data): + if needs_resize(self.data): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.resize() - append_data_{{dtype}}(&self.data, x) + append_data_{{dtype}}(self.data, x) cdef extend(self, const {{c_type}}[:] x): for i in range(len(x)): @@ -251,36 +260,42 @@ cdef class {{name}}Vector(Vector): cdef class StringVector(Vector): cdef: - StringVectorData data + StringVectorData *data def __cinit__(self): - self.data.size = 0 - self.data.capacity = _INIT_VEC_CAP - self.data.data = malloc(self.data.capacity * sizeof(char *)) + self.data = PyMem_Malloc(sizeof(StringVectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() cdef resize(self): cdef: char **orig_data - Py_ssize_t i, capacity + Py_ssize_t i, m - capacity = self.data.capacity - self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) + m = self.data.m + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) orig_data = self.data.data - self.data.data = malloc(self.data.capacity * sizeof(char *)) + self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() - for i in range(capacity): + for i in range(m): self.data.data[i] = orig_data[i] def __dealloc__(self): - free(self.data.data) - self.data.data = NULL + if self.data is not NULL: + if self.data.data is not NULL: + free(self.data.data) + PyMem_Free(self.data) + self.data = NULL def __len__(self) -> int: - return self.data.size + return self.data.n cpdef ndarray[object, ndim=1] to_array(self): cdef: @@ -288,20 +303,20 @@ cdef class StringVector(Vector): Py_ssize_t n object val - ao = np.empty(self.data.size, dtype=object) - for i in range(self.data.size): + ao = np.empty(self.data.n, dtype=object) + for i in range(self.data.n): val = self.data.data[i] ao[i] = val self.external_view_exists = True - self.data.capacity = self.data.size + self.data.m = self.data.n return ao cdef void append(self, char *x) noexcept: - if 
needs_resize(&self.data): + if needs_resize(self.data): self.resize() - append_data_string(&self.data, x) + append_data_string(self.data, x) cdef extend(self, ndarray[object] x): for i in range(len(x)): @@ -312,37 +327,37 @@ cdef class ObjectVector(Vector): cdef: PyObject **data - Py_ssize_t size, capacity + Py_ssize_t n, m ndarray ao def __cinit__(self): - self.size = 0 - self.capacity = _INIT_VEC_CAP + self.n = 0 + self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) self.data = self.ao.data def __len__(self) -> int: - return self.size + return self.n cdef append(self, object obj): - if self.size == self.capacity: + if self.n == self.m: if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.capacity = max(self.capacity * 2, _INIT_VEC_CAP) - self.ao.resize(self.capacity, refcheck=False) + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m, refcheck=False) self.data = self.ao.data Py_INCREF(obj) - self.data[self.size] = obj - self.size += 1 + self.data[self.n] = obj + self.n += 1 cpdef ndarray[object, ndim=1] to_array(self): - if self.capacity != self.size: + if self.m != self.n: if self.external_view_exists: raise ValueError("should have raised on append()") - self.ao.resize(self.size, refcheck=False) - self.capacity = self.size + self.ao.resize(self.n, refcheck=False) + self.m = self.n self.external_view_exists = True return self.ao @@ -629,13 +644,15 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val, na_value2 khiter_t k + {{name}}VectorData *ud UInt8Vector result_mask - UInt8VectorData rmd + UInt8VectorData *rmd bint use_na_value, use_mask, seen_na = False const uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.intp) + ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -683,7 +700,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(&uniques.data): + if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -695,8 +712,8 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(&uniques.data, val) - append_data_uint8(&rmd, 1) + append_data_{{dtype}}(ud, val) + append_data_uint8(rmd, 1) continue k = kh_get_{{dtype}}(self.table, val) @@ -705,7 +722,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(&uniques.data): + if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -718,9 +735,9 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(&uniques.data, val) + append_data_{{dtype}}(ud, val) if use_result_mask: - append_data_uint8(&rmd, 0) + append_data_uint8(rmd, 0) if return_inverse: self.table.vals[k] = count @@ -826,7 +843,7 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData ud + {{name}}VectorData *ud labels = np.empty(n, dtype=np.intp) ud = uniques.data @@ -848,10 +865,10 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(&ud): + if needs_resize(ud): with gil: uniques.resize() - append_data_{{dtype}}(&ud, val) + append_data_{{dtype}}(ud, val) 
labels[i] = count count += 1 diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 1cd740c1dd1f8..336af306d410f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{name}}Vector idx = {{name}}Vector() ndarray[{{c_type}}, ndim=1] arr - {{name}}VectorData ud = idx.data + {{name}}VectorData *ud = idx.data kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) @@ -480,10 +480,10 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(&ud): + if needs_resize(ud): with gil: idx.resize() - append_data_{{ttype}}(&ud, i) + append_data_{{ttype}}(ud, i) kh_destroy_{{ttype}}(table) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index b94f60c272e5d..a6cec0fb30ecc 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -145,12 +145,12 @@ cdef class IntervalTree(IntervalMixin): # overflow -> no match, which is already handled below pass - if result.data.size == old_len: + if result.data.n == old_len: result.append(-1) - elif result.data.size > old_len + 1: + elif result.data.n > old_len + 1: raise KeyError( 'indexer does not intersect a unique set of intervals') - old_len = result.data.size + old_len = result.data.n return result.to_array().astype('intp') def get_indexer_non_unique(self, ndarray[scalar_t, ndim=1] target): @@ -172,10 +172,10 @@ cdef class IntervalTree(IntervalMixin): # overflow -> no match, which is already handled below pass - if result.data.size == old_len: + if result.data.n == old_len: result.append(-1) missing.append(i) - old_len = result.data.size + old_len = result.data.n return (result.to_array().astype('intp'), missing.to_array().astype('intp')) From 04c1748f5ea2830202e065840d534c6b33fc9927 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 10:00:54 -0800 Subject: [PATCH 12/15] try smaller scope --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 52 +++++++--------------- pandas/_libs/hashtable_func_helper.pxi.in | 6 +-- 3 files changed, 21 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index eaec9e8462450..22b923580c491 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -180,7 +180,7 @@ cdef class Vector: cdef bint external_view_exists cdef class Int64Vector(Vector): - cdef Int64VectorData *data + cdef Int64VectorData data cdef ndarray ao cdef resize(self) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 26dcf0b6c4ce3..c445849db18a7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -204,15 +204,11 @@ cdef class {{name}}Vector(Vector): # Int64Vector is the only one we need exposed for other cython files. 
{{if dtype != 'int64'}} cdef: - {{name}}VectorData *data + {{name}}VectorData data ndarray ao {{endif}} def __cinit__(self): - self.data = <{{name}}VectorData *>PyMem_Malloc( - sizeof({{name}}VectorData)) - if not self.data: - raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) @@ -223,11 +219,6 @@ cdef class {{name}}Vector(Vector): self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data - def __dealloc__(self): - if self.data is not NULL: - PyMem_Free(self.data) - self.data = NULL - def __len__(self) -> int: return self.data.n @@ -243,13 +234,13 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data): + if needs_resize(&self.data): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.resize() - append_data_{{dtype}}(self.data, x) + append_data_{{dtype}}(&self.data, x) cdef extend(self, const {{c_type}}[:] x): for i in range(len(x)): @@ -260,12 +251,9 @@ cdef class {{name}}Vector(Vector): cdef class StringVector(Vector): cdef: - StringVectorData *data + StringVectorData data def __cinit__(self): - self.data = PyMem_Malloc(sizeof(StringVectorData)) - if not self.data: - raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) @@ -288,11 +276,7 @@ cdef class StringVector(Vector): self.data.data[i] = orig_data[i] def __dealloc__(self): - if self.data is not NULL: - if self.data.data is not NULL: - free(self.data.data) - PyMem_Free(self.data) - self.data = NULL + free(self.data.data) def __len__(self) -> int: return self.data.n @@ -313,10 +297,10 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(self.data): + if needs_resize(&self.data): self.resize() - append_data_string(self.data, x) + append_data_string(&self.data, x) cdef extend(self, ndarray[object] x): for i in range(len(x)): @@ -644,15 +628,13 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val, na_value2 khiter_t k - {{name}}VectorData *ud UInt8Vector result_mask - UInt8VectorData *rmd + UInt8VectorData rmd bint use_na_value, use_mask, seen_na = False const uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.intp) - ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -700,7 +682,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud): + if needs_resize(&uniques.data): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -712,8 +694,8 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(ud, val) - append_data_uint8(rmd, 1) + append_data_{{dtype}}(&uniques.data, val) + append_data_uint8(&rmd, 1) continue k = kh_get_{{dtype}}(self.table, val) @@ -722,7 +704,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): + if needs_resize(&uniques.data): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -735,9 +717,9 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(ud, val) + append_data_{{dtype}}(&uniques.data, val) if use_result_mask: - append_data_uint8(rmd, 0) + 
append_data_uint8(&rmd, 0) if return_inverse: self.table.vals[k] = count @@ -843,7 +825,7 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud + {{name}}VectorData ud labels = np.empty(n, dtype=np.intp) ud = uniques.data @@ -865,10 +847,10 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud): + if needs_resize(&ud): with gil: uniques.resize() - append_data_{{dtype}}(ud, val) + append_data_{{dtype}}(&ud, val) labels[i] = count count += 1 diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 336af306d410f..1cd740c1dd1f8 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{name}}Vector idx = {{name}}Vector() ndarray[{{c_type}}, ndim=1] arr - {{name}}VectorData *ud = idx.data + {{name}}VectorData ud = idx.data kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) @@ -480,10 +480,10 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud): + if needs_resize(&ud): with gil: idx.resize() - append_data_{{ttype}}(ud, i) + append_data_{{ttype}}(&ud, i) kh_destroy_{{ttype}}(table) From fa05ef2d43c8a17e01168699e62c9bd92192f6a4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 14:51:56 -0800 Subject: [PATCH 13/15] Smaller scope --- pandas/_libs/hashtable_class_helper.pxi.in | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c445849db18a7..629b6b42db852 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -628,13 +628,15 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val, na_value2 khiter_t k + {{name}}VectorData *ud UInt8Vector result_mask - UInt8VectorData rmd + UInt8VectorData *rmd bint use_na_value, use_mask, seen_na = False const uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.intp) + ud = &uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -644,7 +646,7 @@ cdef class {{name}}HashTable(HashTable): raise NotImplementedError # pragma: no cover result_mask = UInt8Vector() - rmd = result_mask.data + rmd = &result_mask.data if use_mask: mask_values = mask.view("uint8") @@ -682,7 +684,7 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(&uniques.data): + if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference to " @@ -694,8 +696,8 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(&uniques.data, val) - append_data_uint8(&rmd, 1) + append_data_{{dtype}}(ud, val) + append_data_uint8(rmd, 1) continue k = kh_get_{{dtype}}(self.table, val) @@ -704,7 +706,7 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(&uniques.data): + if needs_resize(ud): with gil: if uniques.external_view_exists: raise ValueError("external reference 
to " @@ -717,9 +719,9 @@ cdef class {{name}}HashTable(HashTable): "result_mask held, but " "Vector.resize() needed") result_mask.resize() - append_data_{{dtype}}(&uniques.data, val) + append_data_{{dtype}}(ud, val) if use_result_mask: - append_data_uint8(&rmd, 0) + append_data_uint8(rmd, 0) if return_inverse: self.table.vals[k] = count @@ -825,10 +827,10 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData ud + {{name}}VectorData *ud labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data with nogil: for i in range(n): @@ -847,10 +849,10 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(&ud): + if needs_resize(ud): with gil: uniques.resize() - append_data_{{dtype}}(&ud, val) + append_data_{{dtype}}(ud, val) labels[i] = count count += 1 From 2764636f399409fba5dcc649ef33503abf4b5745 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 14:56:53 -0800 Subject: [PATCH 14/15] less change --- pandas/_libs/hashtable_func_helper.pxi.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 1cd740c1dd1f8..ca1b28b9442ca 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{name}}Vector idx = {{name}}Vector() ndarray[{{c_type}}, ndim=1] arr - {{name}}VectorData ud = idx.data + {{name}}VectorData *ud = &idx.data kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) @@ -480,10 +480,10 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(&ud): + if needs_resize(ud): with gil: idx.resize() - append_data_{{ttype}}(&ud, i) + append_data_{{ttype}}(ud, i) kh_destroy_{{ttype}}(table) From f6c4cd2824af82deded96ee343a88c26bd0c315f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 3 Mar 2024 15:32:18 -0800 Subject: [PATCH 15/15] remove unused --- pandas/_libs/hashtable.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..8250d0242c31f 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,8 +1,4 @@ cimport cython -from cpython.mem cimport ( - PyMem_Free, - PyMem_Malloc, -) from cpython.ref cimport ( Py_INCREF, PyObject,
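
The patches above iterate toward one idea: hold the `VectorData` struct inside the vector object by value (renaming `.n`/`.m` to `.size`/`.capacity` along the way was reverted), instead of heap-allocating it with `PyMem_Malloc` and freeing it in `__dealloc__`, and hand helpers like `needs_resize` and `append_data_*` the address of that inline struct. As a reading aid only — not part of the patch series — here is a simplified, standalone sketch of that end state for the int64 case. It assumes Cython 3 (`noexcept` syntax) and NumPy's Cython API; the module layout, the `_INIT_VEC_CAP` constant, and the use of `cnp.PyArray_DATA` are illustrative stand-ins for the real `hashtable_class_helper.pxi.in` template machinery.

```cython
# Minimal sketch of the inline-VectorData pattern (illustration, not pandas source).
from libc.stdint cimport int64_t

cimport numpy as cnp
import numpy as np

cnp.import_array()

cdef Py_ssize_t _INIT_VEC_CAP = 128   # same starting capacity the pandas templates use

cdef struct Int64VectorData:
    int64_t *data
    Py_ssize_t n, m                    # element count and capacity

cdef bint needs_resize(const Int64VectorData *data) noexcept nogil:
    # Grow only once the buffer is completely full.
    return data.n == data.m

cdef void append_data_int64(Int64VectorData *data, int64_t x) noexcept nogil:
    data.data[data.n] = x
    data.n += 1

cdef class Int64Vector:
    cdef:
        Int64VectorData data   # held by value: no PyMem_Malloc in __cinit__,
                               # no PyMem_Free in __dealloc__
        cnp.ndarray ao         # NumPy array that owns the underlying buffer

    def __cinit__(self):
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.ao = np.empty(self.data.m, dtype=np.int64)
        self.data.data = <int64_t*>cnp.PyArray_DATA(self.ao)

    cdef resize(self):
        # Grow the backing array and refresh the raw pointer into it.
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
        self.ao.resize(self.data.m, refcheck=False)
        self.data.data = <int64_t*>cnp.PyArray_DATA(self.ao)

    cdef append(self, int64_t x):
        # Helpers that want a pointer take the address of the inline struct.
        if needs_resize(&self.data):
            self.resize()
        append_data_int64(&self.data, x)

    def __len__(self) -> int:
        return self.data.n
```

Hash-table callers in the series follow the same convention: where they previously stored `uniques.data` (a pointer member) in a local `ud`, they now take `&uniques.data` or pass `&self.data` directly at the call site.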