Skip to content

Commit f20c880

Browse files
committed
reimplement Vector cython class
1 parent 2fe35db commit f20c880

File tree

1 file changed

+128 -92 lines changed

1 file changed

+128 -92 lines changed

pandas/hashtable.pyx

+128 -92
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7+
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
78

89
from util cimport _checknan
910
cimport util
@@ -33,56 +34,26 @@ cdef extern from "Python.h":
3334
int PySlice_Check(object)
3435

3536
cdef size_t _INIT_VEC_CAP = 32
36-
cdef size_t _USE_GIL = 1000000
37-
38-
def list_to_object_array(list obj):
39-
'''
40-
Convert list to object ndarray. Seriously can't believe I had to write this
41-
function
42-
'''
43-
cdef:
44-
Py_ssize_t i, n
45-
ndarray[object] arr
46-
47-
n = len(obj)
48-
arr = np.empty(n, dtype=object)
49-
50-
for i from 0 <= i < n:
51-
arr[i] = obj[i]
52-
53-
return arr
54-
5537

5638
cdef class Vector:
57-
58-
cdef:
59-
size_t n, m
60-
ndarray ao
61-
62-
def __len__(self):
63-
return self.n
64-
65-
cdef inline uint8_t needs_resize(self) nogil:
66-
# if we need to resize
67-
return self.n == self.m
68-
69-
def to_array(self):
70-
self.ao.resize(self.n)
71-
self.m = self.n
72-
return self.ao
73-
39+
pass
7440

7541
cdef class ObjectVector(Vector):
7642

7743
cdef:
7844
PyObject **data
45+
size_t n, m
46+
ndarray ao
7947

8048
def __cinit__(self):
8149
self.n = 0
8250
self.m = _INIT_VEC_CAP
8351
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
8452
self.data = <PyObject**> self.ao.data
8553

54+
def __len__(self):
55+
return self.n
56+
8657
cdef inline append(self, object o):
8758
if self.n == self.m:
8859
self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -93,64 +64,107 @@ cdef class ObjectVector(Vector):
9364
self.data[self.n] = <PyObject*> o
9465
self.n += 1
9566

67+
def to_array(self):
68+
self.ao.resize(self.n)
69+
self.m = self.n
70+
return self.ao
71+
72+
73+
ctypedef struct Int64VectorData:
74+
int64_t *data
75+
size_t n, m
76+
77+
cdef uint8_t Int64VectorData_needs_resize(Int64VectorData *data) nogil:
78+
return data.n == data.m
79+
80+
cdef void Int64VectorData_append(Int64VectorData *data, int64_t x) nogil:
9681

97-
cdef class Int64Vector(Vector):
82+
data.data[data.n] = x
83+
data.n += 1
84+
85+
cdef class Int64Vector:
9886

9987
cdef:
100-
int64_t *data
88+
Int64VectorData *data
89+
ndarray ao
10190

102-
def __cinit__(self, int64_t m = -1):
103-
self.n = 0
104-
self.m = _INIT_VEC_CAP if m == -1 else m
105-
self.ao = np.empty(self.m, dtype=np.int64)
106-
self.data = <int64_t*> self.ao.data
91+
def __cinit__(self):
92+
self.data = <Int64VectorData *>PyMem_Malloc(sizeof(Int64VectorData))
93+
self.data.n = 0
94+
self.data.m = _INIT_VEC_CAP
95+
self.ao = np.empty(self.data.m, dtype=np.int64)
96+
self.data.data = <int64_t*> self.ao.data
10797

10898
cdef resize(self):
109-
self.m = max(self.m * 4, _INIT_VEC_CAP)
110-
self.ao.resize(self.m)
111-
self.data = <int64_t*> self.ao.data
99+
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
100+
self.ao.resize(self.data.m)
101+
self.data.data = <int64_t*> self.ao.data
112102

113-
cdef inline void append_nogil(self, int64_t x) nogil:
103+
def __dealloc__(self):
104+
PyMem_Free(self.data)
114105

115-
if self.needs_resize():
116-
with gil:
117-
self.resize()
106+
def __len__(self):
107+
return self.data.n
118108

119-
self.data[self.n] = x
120-
self.n += 1
109+
def to_array(self):
110+
self.ao.resize(self.data.n)
111+
self.data.m = self.data.n
112+
return self.ao
121113

122114
cdef inline void append(self, int64_t x):
123115

124-
if self.needs_resize():
116+
if Int64VectorData_needs_resize(self.data):
125117
self.resize()
126118

127-
self.data[self.n] = x
128-
self.n += 1
119+
Int64VectorData_append(self.data, x)
120+
121+
ctypedef struct Float64VectorData:
122+
float64_t *data
123+
size_t n, m
124+
125+
cdef uint8_t Float64VectorData_needs_resize(Float64VectorData *data) nogil:
126+
return data.n == data.m
127+
128+
cdef void Float64VectorData_append(Float64VectorData *data, float64_t x) nogil:
129+
130+
data.data[data.n] = x
131+
data.n += 1
129132

130133
cdef class Float64Vector(Vector):
131134

132135
cdef:
133-
float64_t *data
136+
Float64VectorData *data
137+
ndarray ao
134138

135139
def __cinit__(self):
136-
self.n = 0
137-
self.m = _INIT_VEC_CAP
138-
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
139-
self.data = <float64_t*> self.ao.data
140+
self.data = <Float64VectorData *>PyMem_Malloc(sizeof(Float64VectorData))
141+
self.data.n = 0
142+
self.data.m = _INIT_VEC_CAP
143+
self.ao = np.empty(self.data.m, dtype=np.float64)
144+
self.data.data = <float64_t*> self.ao.data
140145

141146
cdef resize(self):
142-
self.m = max(self.m * 2, _INIT_VEC_CAP)
143-
self.ao.resize(self.m)
144-
self.data = <float64_t*> self.ao.data
147+
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
148+
self.ao.resize(self.data.m)
149+
self.data.data = <float64_t*> self.ao.data
145150

146-
cdef inline void append(self, float64_t x) nogil:
147-
if self.needs_resize():
148-
with gil:
149-
self.resize()
151+
def __dealloc__(self):
152+
PyMem_Free(self.data)
150153

151-
self.data[self.n] = x
152-
self.n += 1
154+
def __len__(self):
155+
return self.data.n
156+
157+
def to_array(self):
158+
self.ao.resize(self.data.n)
159+
self.data.m = self.data.n
160+
return self.ao
153161

162+
cdef inline void append(self, float64_t x):
163+
164+
if Float64VectorData_needs_resize(self.data):
165+
self.resize()
166+
167+
Float64VectorData_append(self.data, x)
154168

155169
cdef class HashTable:
156170
pass
@@ -370,25 +384,12 @@ cdef class Int64HashTable(HashTable):
370384
int ret = 0
371385
int64_t val
372386
khiter_t k
387+
Int64VectorData *ud
373388

374389
labels = np.empty(n, dtype=np.int64)
390+
ud = uniques.data
375391

376-
if n > _USE_GIL:
377-
with nogil:
378-
for i in range(n):
379-
val = values[i]
380-
k = kh_get_int64(self.table, val)
381-
if k != self.table.n_buckets:
382-
idx = self.table.vals[k]
383-
labels[i] = idx
384-
else:
385-
k = kh_put_int64(self.table, val, &ret)
386-
self.table.vals[k] = count
387-
uniques.append_nogil(val)
388-
labels[i] = count
389-
count += 1
390-
391-
else:
392+
with nogil:
392393
for i in range(n):
393394
val = values[i]
394395
k = kh_get_int64(self.table, val)
@@ -398,7 +399,11 @@ cdef class Int64HashTable(HashTable):
398399
else:
399400
k = kh_put_int64(self.table, val, &ret)
400401
self.table.vals[k] = count
401-
uniques.append(val)
402+
403+
if Int64VectorData_needs_resize(ud):
404+
with gil:
405+
uniques.resize()
406+
Int64VectorData_append(ud, val)
402407
labels[i] = count
403408
count += 1
404409

@@ -414,8 +419,10 @@ cdef class Int64HashTable(HashTable):
414419
int64_t val
415420
khiter_t k
416421
Int64Vector uniques = Int64Vector()
422+
Int64VectorData *ud
417423

418424
labels = np.empty(n, dtype=np.int64)
425+
ud = uniques.data
419426

420427
with nogil:
421428
for i in range(n):
@@ -433,7 +440,11 @@ cdef class Int64HashTable(HashTable):
433440
else:
434441
k = kh_put_int64(self.table, val, &ret)
435442
self.table.vals[k] = count
436-
uniques.append_nogil(val)
443+
444+
if Int64VectorData_needs_resize(ud):
445+
with gil:
446+
uniques.resize()
447+
Int64VectorData_append(ud, val)
437448
labels[i] = count
438449
count += 1
439450

@@ -450,14 +461,21 @@ cdef class Int64HashTable(HashTable):
450461
int64_t val
451462
khiter_t k
452463
Int64Vector uniques = Int64Vector()
464+
Int64VectorData *ud
465+
466+
ud = uniques.data
453467

454468
with nogil:
455469
for i in range(n):
456470
val = values[i]
457471
k = kh_get_int64(self.table, val)
458472
if k == self.table.n_buckets:
459473
kh_put_int64(self.table, val, &ret)
460-
uniques.append_nogil(val)
474+
475+
if Int64VectorData_needs_resize(ud):
476+
with gil:
477+
uniques.resize()
478+
Int64VectorData_append(ud, val)
461479

462480
result = uniques.to_array()
463481

@@ -518,8 +536,10 @@ cdef class Float64HashTable(HashTable):
518536
int ret = 0
519537
float64_t val
520538
khiter_t k
539+
Float64VectorData *ud
521540

522541
labels = np.empty(n, dtype=np.int64)
542+
ud = uniques.data
523543

524544
with nogil:
525545
for i in range(n):
@@ -536,7 +556,11 @@ cdef class Float64HashTable(HashTable):
536556
else:
537557
k = kh_put_float64(self.table, val, &ret)
538558
self.table.vals[k] = count
539-
uniques.append(val)
559+
560+
if Float64VectorData_needs_resize(ud):
561+
with gil:
562+
uniques.resize()
563+
Float64VectorData_append(ud, val)
540564
labels[i] = count
541565
count += 1
542566

@@ -581,8 +605,11 @@ cdef class Float64HashTable(HashTable):
581605
int ret = 0
582606
float64_t val
583607
khiter_t k
584-
Float64Vector uniques = Float64Vector()
585608
bint seen_na = 0
609+
Float64Vector uniques = Float64Vector()
610+
Float64VectorData *ud
611+
612+
ud = uniques.data
586613

587614
with nogil:
588615
for i in range(n):
@@ -592,10 +619,19 @@ cdef class Float64HashTable(HashTable):
592619
k = kh_get_float64(self.table, val)
593620
if k == self.table.n_buckets:
594621
kh_put_float64(self.table, val, &ret)
595-
uniques.append(val)
622+
623+
if Float64VectorData_needs_resize(ud):
624+
with gil:
625+
uniques.resize()
626+
Float64VectorData_append(ud, val)
627+
596628
elif not seen_na:
597629
seen_na = 1
598-
uniques.append(NAN)
630+
631+
if Float64VectorData_needs_resize(ud):
632+
with gil:
633+
uniques.resize()
634+
Float64VectorData_append(ud, NAN)
599635

600636
return uniques.to_array()
601637

0 commit comments

Comments
 (0)