Skip to content

Less Heap Usage in Hashtable #57701

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Mar 5, 2024
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ cdef class Vector:
cdef bint external_view_exists

cdef class Int64Vector(Vector):
- cdef Int64VectorData *data
+ cdef Int64VectorData data
cdef ndarray ao

cdef resize(self)
Expand Down
4 changes: 0 additions & 4 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
cimport cython
- from cpython.mem cimport (
- PyMem_Free,
- PyMem_Malloc,
- )
from cpython.ref cimport (
Py_INCREF,
PyObject,
Expand Down
36 changes: 10 additions & 26 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -204,15 +204,11 @@ cdef class {{name}}Vector(Vector):
# Int64Vector is the only one we need exposed for other cython files.
{{if dtype != 'int64'}}
cdef:
- {{name}}VectorData *data
+ {{name}}VectorData data
ndarray ao
{{endif}}

def __cinit__(self):
- self.data = <{{name}}VectorData *>PyMem_Malloc(
- sizeof({{name}}VectorData))
- if not self.data:
- raise MemoryError()
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
Expand All @@ -223,11 +219,6 @@ cdef class {{name}}Vector(Vector):
self.ao.resize(self.data.m, refcheck=False)
self.data.data = <{{c_type}}*>self.ao.data

- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
def __len__(self) -> int:
return self.data.n

Expand All @@ -243,13 +234,13 @@ cdef class {{name}}Vector(Vector):

cdef void append(self, {{c_type}} x) noexcept:

- if needs_resize(self.data):
+ if needs_resize(&self.data):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize()

- append_data_{{dtype}}(self.data, x)
+ append_data_{{dtype}}(&self.data, x)

cdef extend(self, const {{c_type}}[:] x):
for i in range(len(x)):
Expand All @@ -260,12 +251,9 @@ cdef class {{name}}Vector(Vector):
cdef class StringVector(Vector):

cdef:
- StringVectorData *data
+ StringVectorData data

def __cinit__(self):
- self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
- if not self.data:
- raise MemoryError()
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
Expand All @@ -288,11 +276,7 @@ cdef class StringVector(Vector):
self.data.data[i] = orig_data[i]

def __dealloc__(self):
- if self.data is not NULL:
- if self.data.data is not NULL:
- free(self.data.data)
- PyMem_Free(self.data)
- self.data = NULL
+ free(self.data.data)

def __len__(self) -> int:
return self.data.n
Expand All @@ -313,10 +297,10 @@ cdef class StringVector(Vector):

cdef void append(self, char *x) noexcept:

- if needs_resize(self.data):
+ if needs_resize(&self.data):
self.resize()

- append_data_string(self.data, x)
+ append_data_string(&self.data, x)

cdef extend(self, ndarray[object] x):
for i in range(len(x)):
Expand Down Expand Up @@ -652,7 +636,7 @@ cdef class {{name}}HashTable(HashTable):

if return_inverse:
labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
+ ud = &uniques.data
use_na_value = na_value is not None
use_mask = mask is not None
if not use_mask and use_result_mask:
Expand All @@ -662,7 +646,7 @@ cdef class {{name}}HashTable(HashTable):
raise NotImplementedError # pragma: no cover

result_mask = UInt8Vector()
- rmd = result_mask.data
+ rmd = &result_mask.data

if use_mask:
mask_values = mask.view("uint8")
Expand Down Expand Up @@ -846,7 +830,7 @@ cdef class {{name}}HashTable(HashTable):
{{name}}VectorData *ud

labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
+ ud = &uniques.data

with nogil:
for i in range(n):
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
{{name}}Vector idx = {{name}}Vector()
ndarray[{{c_type}}, ndim=1] arr
- {{name}}VectorData *ud = idx.data
+ {{name}}VectorData *ud = &idx.data

kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

Expand Down