Skip to content

Commit b69c759

Browse files
committed
reimplement Vector cython class
1 parent faeef89 commit b69c759

File tree

1 file changed

+97
-91
lines changed

1 file changed

+97
-91
lines changed

pandas/hashtable.pyx

+97 -91
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7+
from libc.stdlib cimport malloc, free
78

89
from util cimport _checknan
910
cimport util
@@ -33,7 +34,6 @@ cdef extern from "Python.h":
3334
int PySlice_Check(object)
3435

3536
cdef size_t _INIT_VEC_CAP = 32
36-
cdef size_t _USE_GIL = 1000000
3737

3838
def list_to_object_array(list obj):
3939
'''
@@ -52,30 +52,15 @@ def list_to_object_array(list obj):
5252

5353
return arr
5454

55-
5655
cdef class Vector:
57-
58-
cdef:
59-
size_t n, m
60-
ndarray ao
61-
62-
def __len__(self):
63-
return self.n
64-
65-
cdef inline uint8_t needs_resize(self) nogil:
66-
# if we need to resize
67-
return self.n == self.m
68-
69-
def to_array(self):
70-
self.ao.resize(self.n)
71-
self.m = self.n
72-
return self.ao
73-
56+
pass
7457

7558
cdef class ObjectVector(Vector):
7659

7760
cdef:
7861
PyObject **data
62+
size_t n, m
63+
ndarray ao
7964

8065
def __cinit__(self):
8166
self.n = 0
@@ -93,44 +78,66 @@ cdef class ObjectVector(Vector):
9378
self.data[self.n] = <PyObject*> o
9479
self.n += 1
9580

81+
def to_array(self):
82+
self.ao.resize(self.n)
83+
self.m = self.n
84+
return self.ao
85+
86+
87+
ctypedef struct Int64VectorData:
88+
int64_t *data
89+
size_t n, m
90+
91+
cdef uint8_t Int64VectorData_needs_resize(Int64VectorData *data) nogil:
92+
return data.n == data.m
93+
94+
cdef void Int64VectorData_append(Int64VectorData *data, int64_t x) nogil:
9695

97-
cdef class Int64Vector(Vector):
96+
data.data[data.n] = x
97+
data.n += 1
98+
99+
cdef class Int64Vector:
98100

99101
cdef:
100-
int64_t *data
102+
Int64VectorData *data
103+
ndarray ao
101104

102-
def __cinit__(self, int64_t m = -1):
103-
self.n = 0
104-
self.m = _INIT_VEC_CAP if m == -1 else m
105-
self.ao = np.empty(self.m, dtype=np.int64)
106-
self.data = <int64_t*> self.ao.data
105+
def __cinit__(self):
106+
self.data = <Int64VectorData *>malloc(sizeof(Int64VectorData))
107+
self.data.n = 0
108+
self.data.m = _INIT_VEC_CAP
109+
self.ao = np.empty(self.data.m, dtype=np.int64)
110+
self.data.data = <int64_t*> self.ao.data
107111

108112
cdef resize(self):
109-
self.m = max(self.m * 4, _INIT_VEC_CAP)
110-
self.ao.resize(self.m)
111-
self.data = <int64_t*> self.ao.data
113+
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
114+
self.ao.resize(self.data.m)
115+
self.data.data = <int64_t*> self.ao.data
112116

113-
cdef inline void append_nogil(self, int64_t x) nogil:
117+
def __dealloc__(self):
118+
free(self.data)
114119

115-
if self.needs_resize():
116-
with gil:
117-
self.resize()
120+
def __len__(self):
121+
return self.data.n
118122

119-
self.data[self.n] = x
120-
self.n += 1
123+
def to_array(self):
124+
self.ao.resize(self.data.n)
125+
self.data.m = self.data.n
126+
return self.ao
121127

122128
cdef inline void append(self, int64_t x):
123129

124-
if self.needs_resize():
130+
if Int64VectorData_needs_resize(self.data):
125131
self.resize()
126132

127-
self.data[self.n] = x
128-
self.n += 1
133+
Int64VectorData_append(self.data, x)
129134

130135
cdef class Float64Vector(Vector):
131136

132137
cdef:
133138
float64_t *data
139+
size_t n, m
140+
ndarray ao
134141

135142
def __cinit__(self):
136143
self.n = 0
@@ -144,13 +151,18 @@ cdef class Float64Vector(Vector):
144151
self.data = <float64_t*> self.ao.data
145152

146153
cdef inline void append(self, float64_t x) nogil:
147-
if self.needs_resize():
154+
if self.n == self.m:
148155
with gil:
149156
self.resize()
150157

151158
self.data[self.n] = x
152159
self.n += 1
153160

161+
def to_array(self):
162+
self.ao.resize(self.n)
163+
self.m = self.n
164+
return self.ao
165+
154166

155167
cdef class HashTable:
156168
pass
@@ -370,25 +382,12 @@ cdef class Int64HashTable(HashTable):
370382
int ret = 0
371383
int64_t val
372384
khiter_t k
385+
Int64VectorData *ud
373386

374387
labels = np.empty(n, dtype=np.int64)
388+
ud = uniques.data
375389

376-
if n > _USE_GIL:
377-
with nogil:
378-
for i in range(n):
379-
val = values[i]
380-
k = kh_get_int64(self.table, val)
381-
if k != self.table.n_buckets:
382-
idx = self.table.vals[k]
383-
labels[i] = idx
384-
else:
385-
k = kh_put_int64(self.table, val, &ret)
386-
self.table.vals[k] = count
387-
uniques.append_nogil(val)
388-
labels[i] = count
389-
count += 1
390-
391-
else:
390+
with nogil:
392391
for i in range(n):
393392
val = values[i]
394393
k = kh_get_int64(self.table, val)
@@ -398,7 +397,11 @@ cdef class Int64HashTable(HashTable):
398397
else:
399398
k = kh_put_int64(self.table, val, &ret)
400399
self.table.vals[k] = count
401-
uniques.append(val)
400+
401+
if Int64VectorData_needs_resize(ud):
402+
with gil:
403+
uniques.resize()
404+
Int64VectorData_append(ud, val)
402405
labels[i] = count
403406
count += 1
404407

@@ -414,8 +417,10 @@ cdef class Int64HashTable(HashTable):
414417
int64_t val
415418
khiter_t k
416419
Int64Vector uniques = Int64Vector()
420+
Int64VectorData *ud
417421

418422
labels = np.empty(n, dtype=np.int64)
423+
ud = uniques.data
419424

420425
with nogil:
421426
for i in range(n):
@@ -433,7 +438,11 @@ cdef class Int64HashTable(HashTable):
433438
else:
434439
k = kh_put_int64(self.table, val, &ret)
435440
self.table.vals[k] = count
436-
uniques.append_nogil(val)
441+
442+
if Int64VectorData_needs_resize(ud):
443+
with gil:
444+
uniques.resize()
445+
Int64VectorData_append(ud, val)
437446
labels[i] = count
438447
count += 1
439448

@@ -451,13 +460,12 @@ cdef class Int64HashTable(HashTable):
451460
khiter_t k
452461
Int64Vector uniques = Int64Vector()
453462

454-
with nogil:
455-
for i in range(n):
456-
val = values[i]
457-
k = kh_get_int64(self.table, val)
458-
if k == self.table.n_buckets:
459-
kh_put_int64(self.table, val, &ret)
460-
uniques.append_nogil(val)
463+
for i in range(n):
464+
val = values[i]
465+
k = kh_get_int64(self.table, val)
466+
if k == self.table.n_buckets:
467+
kh_put_int64(self.table, val, &ret)
468+
uniques.append(val)
461469

462470
result = uniques.to_array()
463471

@@ -521,24 +529,23 @@ cdef class Float64HashTable(HashTable):
521529

522530
labels = np.empty(n, dtype=np.int64)
523531

524-
with nogil:
525-
for i in range(n):
526-
val = values[i]
532+
for i in range(n):
533+
val = values[i]
527534

528-
if val != val:
529-
labels[i] = na_sentinel
530-
continue
535+
if val != val:
536+
labels[i] = na_sentinel
537+
continue
531538

532-
k = kh_get_float64(self.table, val)
533-
if k != self.table.n_buckets:
534-
idx = self.table.vals[k]
535-
labels[i] = idx
536-
else:
537-
k = kh_put_float64(self.table, val, &ret)
538-
self.table.vals[k] = count
539-
uniques.append(val)
540-
labels[i] = count
541-
count += 1
539+
k = kh_get_float64(self.table, val)
540+
if k != self.table.n_buckets:
541+
idx = self.table.vals[k]
542+
labels[i] = idx
543+
else:
544+
k = kh_put_float64(self.table, val, &ret)
545+
self.table.vals[k] = count
546+
uniques.append(val)
547+
labels[i] = count
548+
count += 1
542549

543550
return labels
544551

@@ -584,18 +591,17 @@ cdef class Float64HashTable(HashTable):
584591
Float64Vector uniques = Float64Vector()
585592
bint seen_na = 0
586593

587-
with nogil:
588-
for i in range(n):
589-
val = values[i]
594+
for i in range(n):
595+
val = values[i]
590596

591-
if val == val:
592-
k = kh_get_float64(self.table, val)
593-
if k == self.table.n_buckets:
594-
kh_put_float64(self.table, val, &ret)
595-
uniques.append(val)
596-
elif not seen_na:
597-
seen_na = 1
598-
uniques.append(NAN)
597+
if val == val:
598+
k = kh_get_float64(self.table, val)
599+
if k == self.table.n_buckets:
600+
kh_put_float64(self.table, val, &ret)
601+
uniques.append(val)
602+
elif not seen_na:
603+
seen_na = 1
604+
uniques.append(NAN)
599605

600606
return uniques.to_array()
601607

0 commit comments

Comments
 (0)