
Less Heap Usage in Hashtable #57701


Merged: 16 commits, Mar 5, 2024
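In short, the change stores each Vector's backing VectorData struct inline in the cdef class instead of allocating it separately with PyMem_Malloc, and renames the struct's terse n/m fields to size/capacity. That saves one heap allocation per vector and drops the matching __dealloc__ bookkeeping. A minimal Cython sketch of the layout change, using a hypothetical IntVectorData struct in place of the real templated types:

    from cpython.mem cimport PyMem_Free, PyMem_Malloc

    cdef struct IntVectorData:
        long *data
        Py_ssize_t size, capacity

    cdef class IntVectorBefore:
        # Old layout: the object holds a pointer to a separately
        # heap-allocated struct.
        cdef IntVectorData *vd

        def __cinit__(self):
            self.vd = <IntVectorData *>PyMem_Malloc(sizeof(IntVectorData))
            if not self.vd:
                raise MemoryError()

        def __dealloc__(self):
            if self.vd is not NULL:
                PyMem_Free(self.vd)

    cdef class IntVectorAfter:
        # New layout: the struct is embedded in the object, so it is
        # allocated and freed together with the instance.
        cdef IntVectorData vd
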
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable.pxd
@@ -174,13 +174,13 @@ cdef class StringHashTable(HashTable):

 cdef struct Int64VectorData:
     int64_t *data
-    Py_ssize_t n, m
+    Py_ssize_t size, capacity

 cdef class Vector:
     cdef bint external_view_exists

 cdef class Int64Vector(Vector):
-    cdef Int64VectorData *data
+    cdef Int64VectorData data
     cdef ndarray ao

     cdef resize(self)
4 changes: 0 additions & 4 deletions pandas/_libs/hashtable.pyx
@@ -1,8 +1,4 @@
 cimport cython
-from cpython.mem cimport (
-    PyMem_Free,
-    PyMem_Malloc,
-)
 from cpython.ref cimport (
     Py_INCREF,
     PyObject,
128 changes: 55 additions & 73 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -133,7 +133,7 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),

 ctypedef struct {{name}}VectorData:
     {{c_type}} *data
-    Py_ssize_t n, m
+    Py_ssize_t size, capacity

 {{endif}}

@@ -143,8 +143,8 @@ ctypedef struct {{name}}VectorData:
 cdef void append_data_{{dtype}}({{name}}VectorData *data,
                                 {{c_type}} x) noexcept nogil:

-    data.data[data.n] = x
-    data.n += 1
+    data.data[data.size] = x
+    data.size += 1

 {{endfor}}

@@ -163,8 +163,8 @@ ctypedef fused vector_data:
     Complex64VectorData
     StringVectorData

-cdef bint needs_resize(vector_data *data) noexcept nogil:
-    return data.n == data.m
+cdef bint needs_resize(const vector_data *data) noexcept nogil:
+    return data.size == data.capacity

 # ----------------------------------------------------------------------
 # Vector
@@ -204,52 +204,43 @@ cdef class {{name}}Vector(Vector):
     # Int64Vector is the only one we need exposed for other cython files.
     {{if dtype != 'int64'}}
     cdef:
-        {{name}}VectorData *data
+        {{name}}VectorData data
         ndarray ao
     {{endif}}

     def __cinit__(self):
-        self.data = <{{name}}VectorData *>PyMem_Malloc(
-            sizeof({{name}}VectorData))
-        if not self.data:
-            raise MemoryError()
-        self.data.n = 0
-        self.data.m = _INIT_VEC_CAP
-        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
+        self.data.size = 0
+        self.data.capacity = _INIT_VEC_CAP
+        self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}})
         self.data.data = <{{c_type}}*>self.ao.data

     cdef resize(self):
-        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
-        self.ao.resize(self.data.m, refcheck=False)
+        self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP)
+        self.ao.resize(self.data.capacity, refcheck=False)
         self.data.data = <{{c_type}}*>self.ao.data

-    def __dealloc__(self):
-        if self.data is not NULL:
-            PyMem_Free(self.data)
-            self.data = NULL
-
     def __len__(self) -> int:
-        return self.data.n
+        return self.data.size

     cpdef ndarray to_array(self):
-        if self.data.m != self.data.n:
+        if self.data.capacity != self.data.size:
             if self.external_view_exists:
                 # should never happen
                 raise ValueError("should have raised on append()")
-            self.ao.resize(self.data.n, refcheck=False)
-            self.data.m = self.data.n
+            self.ao.resize(self.data.size, refcheck=False)
+            self.data.capacity = self.data.size
         self.external_view_exists = True
         return self.ao

     cdef void append(self, {{c_type}} x) noexcept:

-        if needs_resize(self.data):
+        if needs_resize(&self.data):
             if self.external_view_exists:
                 raise ValueError("external reference but "
                                  "Vector.resize() needed")
             self.resize()

-        append_data_{{dtype}}(self.data, x)
+        append_data_{{dtype}}(&self.data, x)

     cdef extend(self, const {{c_type}}[:] x):
         for i in range(len(x)):
@@ -260,63 +251,56 @@ cdef class {{name}}Vector(Vector):
 cdef class StringVector(Vector):

     cdef:
-        StringVectorData *data
+        StringVectorData data

     def __cinit__(self):
-        self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
-        if not self.data:
-            raise MemoryError()
-        self.data.n = 0
-        self.data.m = _INIT_VEC_CAP
-        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+        self.data.size = 0
+        self.data.capacity = _INIT_VEC_CAP
+        self.data.data = <char **>malloc(self.data.capacity * sizeof(char *))
         if not self.data.data:
             raise MemoryError()

     cdef resize(self):
         cdef:
             char **orig_data
-            Py_ssize_t i, m
+            Py_ssize_t i, capacity

-        m = self.data.m
-        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+        capacity = self.data.capacity
+        self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP)

         orig_data = self.data.data
-        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+        self.data.data = <char **>malloc(self.data.capacity * sizeof(char *))
         if not self.data.data:
             raise MemoryError()
-        for i in range(m):
+        for i in range(capacity):
             self.data.data[i] = orig_data[i]

     def __dealloc__(self):
-        if self.data is not NULL:
-            if self.data.data is not NULL:
-                free(self.data.data)
-            PyMem_Free(self.data)
-            self.data = NULL
+        free(self.data.data)

     def __len__(self) -> int:
-        return self.data.n
+        return self.data.size

     cpdef ndarray[object, ndim=1] to_array(self):
         cdef:
             ndarray ao
             Py_ssize_t n
             object val

-        ao = np.empty(self.data.n, dtype=object)
-        for i in range(self.data.n):
+        ao = np.empty(self.data.size, dtype=object)
+        for i in range(self.data.size):
             val = self.data.data[i]
             ao[i] = val
         self.external_view_exists = True
-        self.data.m = self.data.n
+        self.data.capacity = self.data.size
         return ao

     cdef void append(self, char *x) noexcept:

-        if needs_resize(self.data):
+        if needs_resize(&self.data):
             self.resize()

-        append_data_string(self.data, x)
+        append_data_string(&self.data, x)

     cdef extend(self, ndarray[object] x):
         for i in range(len(x)):
@@ -327,37 +311,37 @@ cdef class ObjectVector(Vector):

     cdef:
         PyObject **data
-        Py_ssize_t n, m
+        Py_ssize_t size, capacity
         ndarray ao

     def __cinit__(self):
-        self.n = 0
-        self.m = _INIT_VEC_CAP
+        self.size = 0
+        self.capacity = _INIT_VEC_CAP
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
         self.data = <PyObject**>self.ao.data

     def __len__(self) -> int:
-        return self.n
+        return self.size

     cdef append(self, object obj):
-        if self.n == self.m:
+        if self.size == self.capacity:
             if self.external_view_exists:
                 raise ValueError("external reference but "
                                  "Vector.resize() needed")
-            self.m = max(self.m * 2, _INIT_VEC_CAP)
-            self.ao.resize(self.m, refcheck=False)
+            self.capacity = max(self.capacity * 2, _INIT_VEC_CAP)
+            self.ao.resize(self.capacity, refcheck=False)
             self.data = <PyObject**>self.ao.data

         Py_INCREF(obj)
-        self.data[self.n] = <PyObject*>obj
-        self.n += 1
+        self.data[self.size] = <PyObject*>obj
+        self.size += 1

     cpdef ndarray[object, ndim=1] to_array(self):
-        if self.m != self.n:
+        if self.capacity != self.size:
             if self.external_view_exists:
                 raise ValueError("should have raised on append()")
-            self.ao.resize(self.n, refcheck=False)
-            self.m = self.n
+            self.ao.resize(self.size, refcheck=False)
+            self.capacity = self.size
         self.external_view_exists = True
         return self.ao

@@ -644,15 +628,13 @@ cdef class {{name}}HashTable(HashTable):
             int ret = 0
             {{c_type}} val, na_value2
             khiter_t k
-            {{name}}VectorData *ud
Member Author comment: Something very strange was going on here, where changing this to a non-pointer, assigning ud = uniques.data, and then trying to pass that by reference to needs_resize was failing. I think it has something to do with Cython pointer/reference semantics, but rather than dive too deep, I figured it was trivial enough to just remove.
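That behavior is consistent with Cython's struct semantics: assigning a struct to a struct-typed variable copies it by value, so taking the address of that variable points at the copy, not at the vector's own member. A hedged sketch of the pitfall, with the templated helpers written out for int64:

    cdef Int64VectorData ud
    ud = uniques.data              # copies size/capacity (and the data pointer)
    if needs_resize(&ud):          # inspects the copy...
        pass
    append_data_int64(&ud, x)      # ...and bumps ud.size, not uniques.data.size

    # Passing the member directly avoids the stale copy:
    if needs_resize(&uniques.data):
        pass
    append_data_int64(&uniques.data, x)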

             UInt8Vector result_mask
-            UInt8VectorData *rmd
+            UInt8VectorData rmd
             bint use_na_value, use_mask, seen_na = False
             const uint8_t[:] mask_values

         if return_inverse:
             labels = np.empty(n, dtype=np.intp)
-        ud = uniques.data
         use_na_value = na_value is not None
         use_mask = mask is not None
         if not use_mask and use_result_mask:
@@ -700,7 +682,7 @@ cdef class {{name}}HashTable(HashTable):
                         continue

                     seen_na = True
-                    if needs_resize(ud):
+                    if needs_resize(&uniques.data):
                         with gil:
                             if uniques.external_view_exists:
                                 raise ValueError("external reference to "
@@ -712,8 +694,8 @@
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
append_data_{{dtype}}(ud, val)
append_data_uint8(rmd, 1)
append_data_{{dtype}}(&uniques.data, val)
append_data_uint8(&rmd, 1)
continue

                 k = kh_get_{{dtype}}(self.table, val)
@@ -722,7 +704,7 @@
                     # k hasn't been seen yet
                     k = kh_put_{{dtype}}(self.table, val, &ret)

-                    if needs_resize(ud):
+                    if needs_resize(&uniques.data):
                         with gil:
                             if uniques.external_view_exists:
                                 raise ValueError("external reference to "
@@ -735,9 +717,9 @@
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
append_data_{{dtype}}(ud, val)
append_data_{{dtype}}(&uniques.data, val)
if use_result_mask:
append_data_uint8(rmd, 0)
append_data_uint8(&rmd, 0)

if return_inverse:
self.table.vals[k] = count
@@ -843,7 +825,7 @@ cdef class {{name}}HashTable(HashTable):
             {{c_type}} val
             khiter_t k
             {{name}}Vector uniques = {{name}}Vector()
-            {{name}}VectorData *ud
+            {{name}}VectorData ud

         labels = np.empty(n, dtype=np.intp)
         ud = uniques.data
@@ -865,10 +847,10 @@
                     k = kh_put_{{dtype}}(self.table, val, &ret)
                     self.table.vals[k] = count

-                    if needs_resize(ud):
+                    if needs_resize(&ud):
                         with gil:
                             uniques.resize()
-                    append_data_{{dtype}}(ud, val)
+                    append_data_{{dtype}}(&ud, val)
                     labels[i] = count
                     count += 1
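The external_view_exists checks scattered through these methods all enforce one invariant: once to_array() has handed out the underlying buffer, the vector trims capacity down to size and must never reallocate, since a resize would move memory out from under the exported ndarray. A simplified sketch of the contract from hypothetical Cython driver code holding an Int64Vector, the class as shown in this diff:

    cdef Int64Vector vec = Int64Vector()
    vec.append(1)
    vec.append(2)

    arr = vec.to_array()   # trims capacity to size, sets external_view_exists

    # capacity == size now, so the next append would need a resize;
    # rather than reallocate beneath arr, append raises ValueError
    vec.append(3)
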
6 changes: 3 additions & 3 deletions pandas/_libs/hashtable_func_helper.pxi.in
@@ -472,18 +472,18 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         {{name}}Vector idx = {{name}}Vector()
         ndarray[{{c_type}}, ndim=1] arr
-        {{name}}VectorData *ud = idx.data
+        {{name}}VectorData ud = idx.data

     kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

     with nogil:
         for i in range(n):
             kh_put_{{ttype}}(table, labels[i], &ret)
             if ret != 0:
-                if needs_resize(ud):
+                if needs_resize(&ud):
                     with gil:
                         idx.resize()
-                append_data_{{ttype}}(ud, i)
+                append_data_{{ttype}}(&ud, i)

     kh_destroy_{{ttype}}(table)

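Note the GIL handling in this loop, which the hashtable methods above follow as well: appends on the hot path are plain C struct and pointer operations that run under nogil, and the GIL is reacquired only on the rare growth path, because resize() goes through ndarray.resize, a Python-level call. The pattern in isolation (a hedged sketch using the int64 instantiation of the templated names):

    with nogil:
        for i in range(n):
            if needs_resize(&ud):
                with gil:               # ndarray.resize is a Python call
                    idx.resize()
            append_data_int64(&ud, i)   # pure C work; no GIL needed
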
10 changes: 5 additions & 5 deletions pandas/_libs/intervaltree.pxi.in
@@ -145,12 +145,12 @@ cdef class IntervalTree(IntervalMixin):
                 # overflow -> no match, which is already handled below
                 pass

-            if result.data.n == old_len:
+            if result.data.size == old_len:
                 result.append(-1)
-            elif result.data.n > old_len + 1:
+            elif result.data.size > old_len + 1:
                 raise KeyError(
                     'indexer does not intersect a unique set of intervals')
-            old_len = result.data.n
+            old_len = result.data.size
         return result.to_array().astype('intp')

     def get_indexer_non_unique(self, ndarray[scalar_t, ndim=1] target):
@@ -172,10 +172,10 @@ cdef class IntervalTree(IntervalMixin):
                 # overflow -> no match, which is already handled below
                 pass

-            if result.data.n == old_len:
+            if result.data.size == old_len:
                 result.append(-1)
                 missing.append(i)
-            old_len = result.data.n
+            old_len = result.data.size
         return (result.to_array().astype('intp'),
                 missing.to_array().astype('intp'))
