Skip to content

Use memcpy / realloc more effectively in hashtable #57695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Mar 21, 2024
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ cdef class Int64Vector(Vector):
cdef Int64VectorData *data
cdef ndarray ao

cdef resize(self)
cdef resize(self, size_t new_size)
cpdef ndarray to_array(self)
cdef void append(self, int64_t x) noexcept
cdef extend(self, int64_t[:] x)
2 changes: 2 additions & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ from cpython.ref cimport (
from libc.stdlib cimport (
free,
malloc,
realloc,
)
from libc.string cimport memcpy

import numpy as np

Expand Down
54 changes: 29 additions & 25 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,8 @@ ctypedef fused vector_data:
Complex64VectorData
StringVectorData

cdef bint needs_resize(vector_data *data) noexcept nogil:
return data.n == data.m
cdef bint needs_resize(size_t nelems, size_t capacity) noexcept nogil:
return nelems >= capacity

# ----------------------------------------------------------------------
# Vector
Expand Down Expand Up @@ -218,9 +218,9 @@ cdef class {{name}}Vector(Vector):
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
self.data.data = <{{c_type}}*>self.ao.data

cdef resize(self):
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m, refcheck=False)
cdef resize(self, size_t new_size):
self.data.m = max(new_size, _INIT_VEC_CAP)
self.ao.resize(new_size, refcheck=False)
self.data.data = <{{c_type}}*>self.ao.data

def __dealloc__(self):
Expand All @@ -243,17 +243,24 @@ cdef class {{name}}Vector(Vector):

cdef void append(self, {{c_type}} x) noexcept:

if needs_resize(self.data):
if needs_resize(self.data.n, self.data.m):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize()
self.resize(self.data.m * 4)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiplying by 4 seems somewhat arbitrary and counter-productive for large arrays, but keeping the status quo for now


append_data_{{dtype}}(self.data, x)

cdef extend(self, const {{c_type}}[:] x):
for i in range(len(x)):
self.append(x[i])
x_size = len(x)
new_size = self.data.n + x_size
if needs_resize(new_size, self.data.m):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize(new_size) # TODO: do we want to multiply by 4?
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With extend I chose not to multiply by a factor of 4. This favors performance when doing really large appends versus many small appends in a loop

Not sure how to best manage this. Long term I would love to get rid of this self-managed code and use the C++ standard library


memcpy(&self.data[0] + self.data.n, &x[0], x_size * sizeof({{c_type}}))

{{endfor}}

Expand All @@ -272,20 +279,17 @@ cdef class StringVector(Vector):
if not self.data.data:
raise MemoryError()

cdef resize(self):
cdef resize(self, size_t new_size):
cdef:
char **orig_data
Py_ssize_t i, m

m = self.data.m
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.data.m = max(new_size, _INIT_VEC_CAP)

orig_data = self.data.data
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
self.data.data = <char **>realloc(self.data, self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
for i in range(m):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are still a few other places where we are doing loop assignment in the object class, but I have not touched them because I don't know what Cython does for reference counting during assignment that a simple memcpy would not know about. Something to research further

self.data.data[i] = orig_data[i]

def __dealloc__(self):
if self.data is not NULL:
Expand Down Expand Up @@ -313,8 +317,8 @@ cdef class StringVector(Vector):

cdef void append(self, char *x) noexcept:

if needs_resize(self.data):
self.resize()
if needs_resize(self.data.n, self.data.m):
self.resize(self.data.m * 4)

append_data_string(self.data, x)

Expand Down Expand Up @@ -700,18 +704,18 @@ cdef class {{name}}HashTable(HashTable):
continue

seen_na = True
if needs_resize(ud):
if needs_resize(ud.n, ud.m):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
uniques.resize(uniques.data.m * 4)
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
result_mask.resize(result_mask.data.m * 4)
append_data_{{dtype}}(ud, val)
append_data_uint8(rmd, 1)
continue
Expand All @@ -722,19 +726,19 @@ cdef class {{name}}HashTable(HashTable):
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)

if needs_resize(ud):
if needs_resize(ud.n, ud.m):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
uniques.resize(uniques.data.m * 4)
if use_result_mask:
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
result_mask.resize(result_mask.data.m * 4)
append_data_{{dtype}}(ud, val)
if use_result_mask:
append_data_uint8(rmd, 0)
Expand Down Expand Up @@ -865,9 +869,9 @@ cdef class {{name}}HashTable(HashTable):
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count

if needs_resize(ud):
if needs_resize(ud.n, ud.m):
with gil:
uniques.resize()
uniques.resize(uniques.data.m * 4)
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
for i in range(n):
kh_put_{{ttype}}(table, labels[i], &ret)
if ret != 0:
if needs_resize(ud):
if needs_resize(ud.n, ud.m):
with gil:
idx.resize()
idx.resize(idx.data.m * 4)
append_data_{{ttype}}(ud, i)

kh_destroy_{{ttype}}(table)
Expand Down