Skip to content

Commit dde7350

Browse files
WillAyd authored and pmhatre1 committed
Less Heap Usage in Hashtable (pandas-dev#57701)
* Hashtable cleanups
* Remove unused imports
* renamed .n -> size, .m -> capacity
* size_t -> Py_ssize_t
* revert needs_resize
* remove unnecessary pointers
* fix build issues
* Removed ud variable
* Fix ObjectVector issue
* try setting NULL in dealloc
* reset things
* try smaller scope
* Smaller scope
* less change
* remove unused
1 parent 48640dd commit dde7350

File tree

4 files changed

+12
-32
lines changed

4 files changed

+12
-32
lines changed

pandas/_libs/hashtable.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ cdef class Vector:
180180
cdef bint external_view_exists
181181

182182
cdef class Int64Vector(Vector):
183-
cdef Int64VectorData *data
183+
cdef Int64VectorData data
184184
cdef ndarray ao
185185

186186
cdef resize(self)

pandas/_libs/hashtable.pyx

-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
11
cimport cython
2-
from cpython.mem cimport (
3-
PyMem_Free,
4-
PyMem_Malloc,
5-
)
62
from cpython.ref cimport (
73
Py_INCREF,
84
PyObject,

pandas/_libs/hashtable_class_helper.pxi.in

+10-26
Original file line numberDiff line numberDiff line change
@@ -204,15 +204,11 @@ cdef class {{name}}Vector(Vector):
204204
# Int64Vector is the only one we need exposed for other cython files.
205205
{{if dtype != 'int64'}}
206206
cdef:
207-
{{name}}VectorData *data
207+
{{name}}VectorData data
208208
ndarray ao
209209
{{endif}}
210210

211211
def __cinit__(self):
212-
self.data = <{{name}}VectorData *>PyMem_Malloc(
213-
sizeof({{name}}VectorData))
214-
if not self.data:
215-
raise MemoryError()
216212
self.data.n = 0
217213
self.data.m = _INIT_VEC_CAP
218214
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
@@ -223,11 +219,6 @@ cdef class {{name}}Vector(Vector):
223219
self.ao.resize(self.data.m, refcheck=False)
224220
self.data.data = <{{c_type}}*>self.ao.data
225221

226-
def __dealloc__(self):
227-
if self.data is not NULL:
228-
PyMem_Free(self.data)
229-
self.data = NULL
230-
231222
def __len__(self) -> int:
232223
return self.data.n
233224

@@ -243,13 +234,13 @@ cdef class {{name}}Vector(Vector):
243234

244235
cdef void append(self, {{c_type}} x) noexcept:
245236

246-
if needs_resize(self.data):
237+
if needs_resize(&self.data):
247238
if self.external_view_exists:
248239
raise ValueError("external reference but "
249240
"Vector.resize() needed")
250241
self.resize()
251242

252-
append_data_{{dtype}}(self.data, x)
243+
append_data_{{dtype}}(&self.data, x)
253244

254245
cdef extend(self, const {{c_type}}[:] x):
255246
for i in range(len(x)):
@@ -260,12 +251,9 @@ cdef class {{name}}Vector(Vector):
260251
cdef class StringVector(Vector):
261252

262253
cdef:
263-
StringVectorData *data
254+
StringVectorData data
264255

265256
def __cinit__(self):
266-
self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
267-
if not self.data:
268-
raise MemoryError()
269257
self.data.n = 0
270258
self.data.m = _INIT_VEC_CAP
271259
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
@@ -288,11 +276,7 @@ cdef class StringVector(Vector):
288276
self.data.data[i] = orig_data[i]
289277

290278
def __dealloc__(self):
291-
if self.data is not NULL:
292-
if self.data.data is not NULL:
293-
free(self.data.data)
294-
PyMem_Free(self.data)
295-
self.data = NULL
279+
free(self.data.data)
296280

297281
def __len__(self) -> int:
298282
return self.data.n
@@ -313,10 +297,10 @@ cdef class StringVector(Vector):
313297

314298
cdef void append(self, char *x) noexcept:
315299

316-
if needs_resize(self.data):
300+
if needs_resize(&self.data):
317301
self.resize()
318302

319-
append_data_string(self.data, x)
303+
append_data_string(&self.data, x)
320304

321305
cdef extend(self, ndarray[object] x):
322306
for i in range(len(x)):
@@ -652,7 +636,7 @@ cdef class {{name}}HashTable(HashTable):
652636

653637
if return_inverse:
654638
labels = np.empty(n, dtype=np.intp)
655-
ud = uniques.data
639+
ud = &uniques.data
656640
use_na_value = na_value is not None
657641
use_mask = mask is not None
658642
if not use_mask and use_result_mask:
@@ -662,7 +646,7 @@ cdef class {{name}}HashTable(HashTable):
662646
raise NotImplementedError # pragma: no cover
663647

664648
result_mask = UInt8Vector()
665-
rmd = result_mask.data
649+
rmd = &result_mask.data
666650

667651
if use_mask:
668652
mask_values = mask.view("uint8")
@@ -846,7 +830,7 @@ cdef class {{name}}HashTable(HashTable):
846830
{{name}}VectorData *ud
847831

848832
labels = np.empty(n, dtype=np.intp)
849-
ud = uniques.data
833+
ud = &uniques.data
850834

851835
with nogil:
852836
for i in range(n):

pandas/_libs/hashtable_func_helper.pxi.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
472472
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
473473
{{name}}Vector idx = {{name}}Vector()
474474
ndarray[{{c_type}}, ndim=1] arr
475-
{{name}}VectorData *ud = idx.data
475+
{{name}}VectorData *ud = &idx.data
476476

477477
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
478478

0 commit comments

Comments
 (0)