From 9885a51a701fb1d62272454f85345f9e7048d9e8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 10:13:33 -0800 Subject: [PATCH 01/13] Use memcpy / realloc more effectively in hashtable --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable.pyx | 2 + pandas/_libs/hashtable_class_helper.pxi.in | 54 ++++++++++++---------- pandas/_libs/hashtable_func_helper.pxi.in | 4 +- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index eaec9e8462450..7feaec723810c 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -183,7 +183,7 @@ cdef class Int64Vector(Vector): cdef Int64VectorData *data cdef ndarray ao - cdef resize(self) + cdef resize(self, size_t new_size) cpdef ndarray to_array(self) cdef void append(self, int64_t x) noexcept cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..4c3a003473ceb 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -10,7 +10,9 @@ from cpython.ref cimport ( from libc.stdlib cimport ( free, malloc, + realloc, ) +from libc.string cimport memcpy import numpy as np diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 26dcf0b6c4ce3..19c2447abb1ac 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -163,8 +163,8 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(vector_data *data) noexcept nogil: - return data.n == data.m +cdef bint needs_resize(size_t nelems, size_t capacity) noexcept nogil: + return nelems >= capacity # ---------------------------------------------------------------------- # Vector @@ -218,9 +218,9 @@ cdef class {{name}}Vector(Vector): self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m, refcheck=False) + cdef resize(self, size_t new_size): + self.data.m = max(new_size, _INIT_VEC_CAP) + self.ao.resize(new_size, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data def __dealloc__(self): @@ -243,17 +243,24 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data): + if needs_resize(self.data.n, self.data.m): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.resize() + self.resize(self.data.m * 4) append_data_{{dtype}}(self.data, x) cdef extend(self, const {{c_type}}[:] x): - for i in range(len(x)): - self.append(x[i]) + x_size = len(x) + new_size = self.data.n + x_size + if needs_resize(new_size, self.data.m): + if self.external_view_exists: + raise ValueError("external reference but " + "Vector.resize() needed") + self.resize(new_size) # TODO: do we want to multiply by 4? + + memcpy(&self.data[0] + self.data.n, &x[0], x_size * sizeof({{c_type}})) {{endfor}} @@ -272,20 +279,17 @@ cdef class StringVector(Vector): if not self.data.data: raise MemoryError() - cdef resize(self): + cdef resize(self, size_t new_size): cdef: char **orig_data Py_ssize_t i, m m = self.data.m - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.data.m = max(new_size, _INIT_VEC_CAP) - orig_data = self.data.data - self.data.data = malloc(self.data.m * sizeof(char *)) + self.data.data = realloc(self.data, self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() - for i in range(m): - self.data.data[i] = orig_data[i] def __dealloc__(self): if self.data is not NULL: @@ -313,8 +317,8 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(self.data): - self.resize() + if needs_resize(self.data.n, self.data.m): + self.resize(self.data.m * 4) append_data_string(self.data, x) @@ -700,18 +704,18 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize() + uniques.resize(uniques.data.m * 4) if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize() + result_mask.resize(result_mask.data.m * 4) append_data_{{dtype}}(ud, val) append_data_uint8(rmd, 1) continue @@ -722,19 +726,19 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize() + uniques.resize(uniques.data.m * 4) if use_result_mask: if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize() + result_mask.resize(result_mask.data.m * 4) append_data_{{dtype}}(ud, val) if use_result_mask: append_data_uint8(rmd, 0) @@ -865,9 +869,9 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: - uniques.resize() + uniques.resize(uniques.data.m * 4) append_data_{{dtype}}(ud, val) labels[i] = count count += 1 diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 336af306d410f..504fcec94ba22 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -480,9 +480,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud): + if needs_resize(ud.n, ud.m): with gil: - idx.resize() + idx.resize(idx.data.m * 4) append_data_{{ttype}}(ud, i) kh_destroy_{{ttype}}(table) From c10cc4403cbdb41dee12767ffc6a9f67fc8e2c9f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 11:30:18 -0800 Subject: [PATCH 02/13] Try fix --- pandas/_libs/hashtable_class_helper.pxi.in | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 19c2447abb1ac..aa745150d0dbc 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -280,14 +280,8 @@ cdef class StringVector(Vector): raise MemoryError() cdef resize(self, size_t new_size): - cdef: - char **orig_data - Py_ssize_t i, m - - m = self.data.m self.data.m = max(new_size, _INIT_VEC_CAP) - - self.data.data = realloc(self.data, self.data.m * sizeof(char *)) + self.data.data = realloc(self.data.data, self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() From b330fad36c6e817918779e2016fc3c297d7452e3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 13:39:55 -0800 Subject: [PATCH 03/13] skip test --- pandas/tests/test_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 9c2b9a76bbb83..f5f44bbc6b99d 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -222,6 +222,7 @@ def test_temp_setattr(with_exception): assert ser.name == "first" +@pytest.mark.skip("seeing if this is the only issue") @pytest.mark.single_cpu def test_str_size(): # GH#21758 From 61955f10bc2a02360efe6337f57180a372f34e85 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 14:47:56 -0800 Subject: [PATCH 04/13] revert StringVector realloc --- pandas/_libs/hashtable.pyx | 1 - pandas/_libs/hashtable_class_helper.pxi.in | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 4c3a003473ceb..5f14c1ffa7c04 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -10,7 +10,6 @@ from cpython.ref cimport ( from libc.stdlib cimport ( free, malloc, - realloc, ) from libc.string cimport memcpy diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index aa745150d0dbc..9814d2078b793 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -280,10 +280,21 @@ cdef class StringVector(Vector): raise MemoryError() cdef resize(self, size_t new_size): + cdef: self.data.m = max(new_size, _INIT_VEC_CAP) + char **orig_data self.data.data = realloc(self.data.data, self.data.m * sizeof(char *)) + Py_ssize_t i, m + + m = self.data.m + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + + orig_data = self.data.data + self.data.data = malloc(self.data.m * sizeof(char *)) if not self.data.data: raise MemoryError() + for i in range(m): + self.data.data[i] = orig_data[i] def __dealloc__(self): if self.data is not NULL: From 40359fdb30f83091f722578c8a053360c58cb73d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 14:55:47 -0800 Subject: [PATCH 05/13] fixes --- pandas/_libs/hashtable_class_helper.pxi.in | 4 +--- pandas/tests/test_common.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 9814d2078b793..6216e9cfd9135 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -220,7 +220,7 @@ cdef class {{name}}Vector(Vector): cdef resize(self, size_t new_size): self.data.m = max(new_size, _INIT_VEC_CAP) - self.ao.resize(new_size, refcheck=False) + self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data def __dealloc__(self): @@ -281,9 +281,7 @@ cdef class StringVector(Vector): cdef resize(self, size_t new_size): cdef: - self.data.m = max(new_size, _INIT_VEC_CAP) char **orig_data - self.data.data = realloc(self.data.data, self.data.m * sizeof(char *)) Py_ssize_t i, m m = self.data.m diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index f5f44bbc6b99d..9c2b9a76bbb83 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -222,7 +222,6 @@ def test_temp_setattr(with_exception): assert ser.name == "first" -@pytest.mark.skip("seeing if this is the only issue") @pytest.mark.single_cpu def test_str_size(): # GH#21758 From d9b2d65ea23f130a3bead7f37429a98130be4e05 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 15:21:59 -0800 Subject: [PATCH 06/13] Fix missing size increment --- pandas/_libs/hashtable_class_helper.pxi.in | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6216e9cfd9135..3c46af663a3c6 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -261,6 +261,7 @@ cdef class {{name}}Vector(Vector): self.resize(new_size) # TODO: do we want to multiply by 4? memcpy(&self.data[0] + self.data.n, &x[0], x_size * sizeof({{c_type}})) + self.data.n += x_size {{endfor}} From e40826bd671c467c4b67afbad175c6e4e9c4db28 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Mar 2024 15:28:19 -0800 Subject: [PATCH 07/13] size_t -> Py_ssize_t --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 7feaec723810c..7d724a377191c 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -183,7 +183,7 @@ cdef class Int64Vector(Vector): cdef Int64VectorData *data cdef ndarray ao - cdef resize(self, size_t new_size) + cdef resize(self, Py_ssize_t new_size) cpdef ndarray to_array(self) cdef void append(self, int64_t x) noexcept cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3c46af663a3c6..c38ad378b3926 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -163,7 +163,7 @@ ctypedef fused vector_data: Complex64VectorData StringVectorData -cdef bint needs_resize(size_t nelems, size_t capacity) noexcept nogil: +cdef bint needs_resize(Py_ssize_t nelems, Py_ssize_t capacity) noexcept nogil: return nelems >= capacity # ---------------------------------------------------------------------- @@ -218,7 +218,7 @@ cdef class {{name}}Vector(Vector): self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) self.data.data = <{{c_type}}*>self.ao.data - cdef resize(self, size_t new_size): + cdef resize(self, Py_ssize_t new_size): self.data.m = max(new_size, _INIT_VEC_CAP) self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data @@ -280,7 +280,7 @@ cdef class StringVector(Vector): if not self.data.data: raise MemoryError() - cdef resize(self, size_t new_size): + cdef resize(self, Py_ssize_t new_size): cdef: char **orig_data Py_ssize_t i, m From 946a203da20952a0cb2fc3acf2e7bc9632d4ea0d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 2 Mar 2024 08:56:25 -0800 Subject: [PATCH 08/13] cleanups --- pandas/_libs/hashtable_class_helper.pxi.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c38ad378b3926..4a75f3110fbbe 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -252,16 +252,16 @@ cdef class {{name}}Vector(Vector): append_data_{{dtype}}(self.data, x) cdef extend(self, const {{c_type}}[:] x): - x_size = len(x) - new_size = self.data.n + x_size - if needs_resize(new_size, self.data.m): + cdef Py_ssize_t x_size = len(x) + cdef Py_ssize_t needed_size = self.data.n + x_size + if needs_resize(needed_size, self.data.m): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.resize(new_size) # TODO: do we want to multiply by 4? + self.resize(needed_size) - memcpy(&self.data[0] + self.data.n, &x[0], x_size * sizeof({{c_type}})) - self.data.n += x_size + memcpy(&self.dataself[self.data.n], &x[0], x_size * sizeof({{c_type}})) + self.data.n = needed_size {{endfor}} From 03bae727b992e4dd9ddd76a3d15a4cd9f2269a90 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 19 Mar 2024 18:21:21 -0400 Subject: [PATCH 09/13] Fixed typo --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 734dea3da31d5..3b7879b0c7f88 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -251,7 +251,7 @@ cdef class {{name}}Vector(Vector): "Vector.resize() needed") self.resize(needed_size) - memcpy(&self.dataself[self.data.n], &x[0], x_size * sizeof({{c_type}})) + memcpy(&self.data[self.data.n], &x[0], x_size * sizeof({{c_type}})) self.data.n = needed_size {{endfor}} From 7d3dee2b69b473062d83ab86a11395b4b2302eae Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 19 Mar 2024 18:25:36 -0400 Subject: [PATCH 10/13] Fixes --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3b7879b0c7f88..37bfb6a94335d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -251,7 +251,7 @@ cdef class {{name}}Vector(Vector): "Vector.resize() needed") self.resize(needed_size) - memcpy(&self.data[self.data.n], &x[0], x_size * sizeof({{c_type}})) + memcpy(self.data.data + self.data.n, &x[0], x_size * sizeof({{c_type}})) self.data.n = needed_size {{endfor}} From 6703c166b404853dd4b469a29e068eb3e9e9d30c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 20 Mar 2024 16:55:08 -0400 Subject: [PATCH 11/13] merge fixups --- pandas/_libs/hashtable_class_helper.pxi.in | 32 +++++++++++----------- pandas/_libs/hashtable_func_helper.pxi.in | 4 +-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 2f7bb2bee2d16..807c596ce0f2b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -235,25 +235,25 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data.n, self.data.m): + if needs_resize(self.data.size, self.data.capacity): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") - self.resize(self.data.m * 4) + self.resize(self.data.capacity * 4) append_data_{{dtype}}(&self.data, x) cdef extend(self, const {{c_type}}[:] x): cdef Py_ssize_t x_size = len(x) - cdef Py_ssize_t needed_size = self.data.n + x_size - if needs_resize(needed_size, self.data.m): + cdef Py_ssize_t needed_size = self.data.size + x_size + if needs_resize(needed_size, self.data.capacity): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.resize(needed_size) - memcpy(self.data.data + self.data.n, &x[0], x_size * sizeof({{c_type}})) - self.data.n = needed_size + memcpy(self.data.data + self.data.size, &x[0], x_size * sizeof({{c_type}})) + self.data.size = needed_size {{endfor}} @@ -306,8 +306,8 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(self.data.n, self.data.m): - self.resize(self.data.m * 4) + if needs_resize(self.data.size, self.data.capacity): + self.resize(self.data.capacity * 4) append_data_string(&self.data, x) @@ -693,18 +693,18 @@ cdef class {{name}}HashTable(HashTable): continue seen_na = True - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize(uniques.data.m * 4) + uniques.resize(uniques.data.capacity * 4) if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize(result_mask.data.m * 4) + result_mask.resize(result_mask.data.capacity * 4) append_data_{{dtype}}(ud, val) append_data_uint8(rmd, 1) continue @@ -715,19 +715,19 @@ cdef class {{name}}HashTable(HashTable): # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: if uniques.external_view_exists: raise ValueError("external reference to " "uniques held, but " "Vector.resize() needed") - uniques.resize(uniques.data.m * 4) + uniques.resize(uniques.data.capacity * 4) if use_result_mask: if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") - result_mask.resize(result_mask.data.m * 4) + result_mask.resize(result_mask.data.capacity * 4) append_data_{{dtype}}(ud, val) if use_result_mask: append_data_uint8(rmd, 0) @@ -858,9 +858,9 @@ cdef class {{name}}HashTable(HashTable): k = kh_put_{{dtype}}(self.table, val, &ret) self.table.vals[k] = count - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: - uniques.resize(uniques.data.m * 4) + uniques.resize(uniques.data.capacity * 4) append_data_{{dtype}}(ud, val) labels[i] = count count += 1 diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 5beee46d1ce43..5500fadb73b6d 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -480,9 +480,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: for i in range(n): kh_put_{{ttype}}(table, labels[i], &ret) if ret != 0: - if needs_resize(ud.n, ud.m): + if needs_resize(ud.size, ud.capacity): with gil: - idx.resize(idx.data.m * 4) + idx.resize(idx.data.capacity * 4) append_data_{{ttype}}(ud, i) kh_destroy_{{ttype}}(table) From dd08e40b717d296779897040867065561d37cf12 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 20 Mar 2024 17:01:04 -0400 Subject: [PATCH 12/13] early return --- pandas/_libs/hashtable_class_helper.pxi.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 807c596ce0f2b..ae7f9f93a839d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -245,6 +245,9 @@ cdef class {{name}}Vector(Vector): cdef extend(self, const {{c_type}}[:] x): cdef Py_ssize_t x_size = len(x) + if x_size == 0: + return + cdef Py_ssize_t needed_size = self.data.size + x_size if needs_resize(needed_size, self.data.capacity): if self.external_view_exists: From cfa5b85fcecd57a9d3b40faf4f8d21a5573486b6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 20 Mar 2024 20:22:12 -0400 Subject: [PATCH 13/13] fix merge issue --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ae7f9f93a839d..f9abd574dae01 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -216,7 +216,7 @@ cdef class {{name}}Vector(Vector): self.data.data = <{{c_type}}*>self.ao.data cdef resize(self, Py_ssize_t new_size): - self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP) + self.data.capacity = max(new_size, _INIT_VEC_CAP) self.ao.resize(self.data.capacity, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data