Skip to content

Use memcpy / realloc more effectively in hashtable #57695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
17 commits merged on Mar 21, 2024
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ cdef class Int64Vector(Vector):
cdef Int64VectorData data
cdef ndarray ao

cdef resize(self)
cdef resize(self, Py_ssize_t new_size)
cpdef ndarray to_array(self)
cdef void append(self, int64_t x) noexcept
cdef extend(self, int64_t[:] x)
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ from libc.stdlib cimport (
free,
malloc,
)
from libc.string cimport memcpy

import numpy as np

Expand Down
50 changes: 31 additions & 19 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ ctypedef fused vector_data:
Complex64VectorData
StringVectorData

cdef bint needs_resize(vector_data *data) noexcept nogil:
return data.size == data.capacity

# True when a vector currently holding `nelems` elements has reached (or,
# after a bulk extend, would exceed) `capacity` and the backing buffer must
# be reallocated before more data is appended.
# NOTE(review): takes plain sizes rather than a vector_data* so callers can
# pre-check a batch insert (size + batch) in one resize — see the extend hunk.
cdef bint needs_resize(Py_ssize_t nelems, Py_ssize_t capacity) noexcept nogil:
    return nelems >= capacity

# ----------------------------------------------------------------------
# Vector
Expand Down Expand Up @@ -214,8 +215,8 @@ cdef class {{name}}Vector(Vector):
self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}})
self.data.data = <{{c_type}}*>self.ao.data

cdef resize(self):
self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP)
# Grow the backing ndarray so it can hold at least `new_size` elements,
# never shrinking below the _INIT_VEC_CAP floor.
cdef resize(self, Py_ssize_t new_size):
    self.data.capacity = max(new_size, _INIT_VEC_CAP)
    # refcheck=False resizes in place even if other references exist;
    # callers guard against live external views via external_view_exists
    # before reaching here (see the append/extend call sites).
    self.ao.resize(self.data.capacity, refcheck=False)
    # Re-sync the raw pointer — resize may have reallocated the buffer.
    self.data.data = <{{c_type}}*>self.ao.data

Expand All @@ -234,17 +235,28 @@ cdef class {{name}}Vector(Vector):

cdef void append(self, {{c_type}} x) noexcept:

if needs_resize(&self.data):
if needs_resize(self.data.size, self.data.capacity):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize()
self.resize(self.data.capacity * 4)

append_data_{{dtype}}(&self.data, x)

cdef extend(self, const {{c_type}}[:] x):
for i in range(len(x)):
self.append(x[i])
cdef Py_ssize_t x_size = len(x)
if x_size == 0:
return

cdef Py_ssize_t needed_size = self.data.size + x_size
if needs_resize(needed_size, self.data.capacity):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize(needed_size)

memcpy(self.data.data + self.data.size, &x[0], x_size * sizeof({{c_type}}))
self.data.size = needed_size

{{endfor}}

Expand All @@ -260,7 +272,7 @@ cdef class StringVector(Vector):
if self.data.data is NULL:
raise MemoryError()

cdef resize(self):
cdef resize(self, Py_ssize_t new_size):
cdef:
char **orig_data
Py_ssize_t i, orig_capacity
Expand Down Expand Up @@ -297,8 +309,8 @@ cdef class StringVector(Vector):

cdef void append(self, char *x) noexcept:

if needs_resize(&self.data):
self.resize()
if needs_resize(self.data.size, self.data.capacity):
self.resize(self.data.capacity * 4)

append_data_string(&self.data, x)

Expand Down Expand Up @@ -684,18 +696,18 @@ cdef class {{name}}HashTable(HashTable):
continue

seen_na = True
if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
uniques.resize(uniques.data.capacity * 4)
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
result_mask.resize(result_mask.data.capacity * 4)
append_data_{{dtype}}(ud, val)
append_data_uint8(rmd, 1)
continue
Expand All @@ -706,19 +718,19 @@ cdef class {{name}}HashTable(HashTable):
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)

if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
uniques.resize(uniques.data.capacity * 4)
if use_result_mask:
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
result_mask.resize(result_mask.data.capacity * 4)
append_data_{{dtype}}(ud, val)
if use_result_mask:
append_data_uint8(rmd, 0)
Expand Down Expand Up @@ -849,9 +861,9 @@ cdef class {{name}}HashTable(HashTable):
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count

if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
uniques.resize()
uniques.resize(uniques.data.capacity * 4)
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
for i in range(n):
kh_put_{{ttype}}(table, labels[i], &ret)
if ret != 0:
if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
idx.resize()
idx.resize(idx.data.capacity * 4)
append_data_{{ttype}}(ud, i)

kh_destroy_{{ttype}}(table)
Expand Down