Skip to content

Commit 7394190

Browse files
committed
reimplement Vector cython class
1 parent 07fbe21 commit 7394190

File tree

5 files changed

+185
-176
lines changed

5 files changed

+185
-176
lines changed

pandas/core/algorithms.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
131131
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
132132

133133
table = hash_klass(size_hint or len(vals))
134-
uniques = vec_klass(len(vals))
134+
uniques = vec_klass()
135135
labels = table.get_labels(vals, uniques, 0, na_sentinel)
136136

137137
labels = com._ensure_platform_int(labels)

pandas/hashtable.pyx

+132 -95
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
44

55
from khash cimport *
66
from numpy cimport *
7+
from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
78

89
from util cimport _checknan
910
cimport util
@@ -33,56 +34,23 @@ cdef extern from "Python.h":
3334
int PySlice_Check(object)
3435

3536
cdef size_t _INIT_VEC_CAP = 32
36-
cdef size_t _USE_GIL = 1000000
37-
38-
def list_to_object_array(list obj):
39-
'''
40-
Convert list to object ndarray. Seriously can't believe I had to write this
41-
function
42-
'''
43-
cdef:
44-
Py_ssize_t i, n
45-
ndarray[object] arr
46-
47-
n = len(obj)
48-
arr = np.empty(n, dtype=object)
49-
50-
for i from 0 <= i < n:
51-
arr[i] = obj[i]
52-
53-
return arr
5437

55-
56-
cdef class Vector:
38+
cdef class ObjectVector:
5739

5840
cdef:
41+
PyObject **data
5942
size_t n, m
6043
ndarray ao
6144

62-
def __len__(self):
63-
return self.n
64-
65-
cdef inline uint8_t needs_resize(self) nogil:
66-
# if we need to resize
67-
return self.n == self.m
68-
69-
def to_array(self):
70-
self.ao.resize(self.n)
71-
self.m = self.n
72-
return self.ao
73-
74-
75-
cdef class ObjectVector(Vector):
76-
77-
cdef:
78-
PyObject **data
79-
8045
def __cinit__(self):
8146
self.n = 0
8247
self.m = _INIT_VEC_CAP
8348
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
8449
self.data = <PyObject**> self.ao.data
8550

51+
def __len__(self):
52+
return self.n
53+
8654
cdef inline append(self, object o):
8755
if self.n == self.m:
8856
self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -93,64 +61,111 @@ cdef class ObjectVector(Vector):
9361
self.data[self.n] = <PyObject*> o
9462
self.n += 1
9563

64+
def to_array(self):
65+
self.ao.resize(self.n)
66+
self.m = self.n
67+
return self.ao
68+
69+
70+
ctypedef struct Int64VectorData:
71+
int64_t *data
72+
size_t n, m
73+
74+
cdef uint8_t Int64VectorData_needs_resize(Int64VectorData *data) nogil:
75+
return data.n == data.m
76+
77+
cdef void Int64VectorData_append(Int64VectorData *data, int64_t x) nogil:
78+
79+
data.data[data.n] = x
80+
data.n += 1
9681

97-
cdef class Int64Vector(Vector):
82+
cdef class Int64Vector:
9883

9984
cdef:
100-
int64_t *data
85+
Int64VectorData *data
86+
ndarray ao
10187

102-
def __cinit__(self, int64_t m = -1):
103-
self.n = 0
104-
self.m = _INIT_VEC_CAP if m == -1 else m
105-
self.ao = np.empty(self.m, dtype=np.int64)
106-
self.data = <int64_t*> self.ao.data
88+
def __cinit__(self):
89+
self.data = <Int64VectorData *>PyMem_Malloc(sizeof(Int64VectorData))
90+
if not self.data:
91+
raise MemoryError()
92+
self.data.n = 0
93+
self.data.m = _INIT_VEC_CAP
94+
self.ao = np.empty(self.data.m, dtype=np.int64)
95+
self.data.data = <int64_t*> self.ao.data
10796

10897
cdef resize(self):
109-
self.m = max(self.m * 4, _INIT_VEC_CAP)
110-
self.ao.resize(self.m)
111-
self.data = <int64_t*> self.ao.data
98+
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
99+
self.ao.resize(self.data.m)
100+
self.data.data = <int64_t*> self.ao.data
112101

113-
cdef inline void append_nogil(self, int64_t x) nogil:
102+
def __dealloc__(self):
103+
PyMem_Free(self.data)
114104

115-
if self.needs_resize():
116-
with gil:
117-
self.resize()
105+
def __len__(self):
106+
return self.data.n
118107

119-
self.data[self.n] = x
120-
self.n += 1
108+
def to_array(self):
109+
self.ao.resize(self.data.n)
110+
self.data.m = self.data.n
111+
return self.ao
121112

122113
cdef inline void append(self, int64_t x):
123114

124-
if self.needs_resize():
115+
if Int64VectorData_needs_resize(self.data):
125116
self.resize()
126117

127-
self.data[self.n] = x
128-
self.n += 1
118+
Int64VectorData_append(self.data, x)
129119

130-
cdef class Float64Vector(Vector):
120+
ctypedef struct Float64VectorData:
121+
float64_t *data
122+
size_t n, m
123+
124+
cdef uint8_t Float64VectorData_needs_resize(Float64VectorData *data) nogil:
125+
return data.n == data.m
126+
127+
cdef void Float64VectorData_append(Float64VectorData *data, float64_t x) nogil:
128+
129+
data.data[data.n] = x
130+
data.n += 1
131+
132+
cdef class Float64Vector:
131133

132134
cdef:
133-
float64_t *data
135+
Float64VectorData *data
136+
ndarray ao
134137

135138
def __cinit__(self):
136-
self.n = 0
137-
self.m = _INIT_VEC_CAP
138-
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
139-
self.data = <float64_t*> self.ao.data
139+
self.data = <Float64VectorData *>PyMem_Malloc(sizeof(Float64VectorData))
140+
if not self.data:
141+
raise MemoryError()
142+
self.data.n = 0
143+
self.data.m = _INIT_VEC_CAP
144+
self.ao = np.empty(self.data.m, dtype=np.float64)
145+
self.data.data = <float64_t*> self.ao.data
140146

141147
cdef resize(self):
142-
self.m = max(self.m * 2, _INIT_VEC_CAP)
143-
self.ao.resize(self.m)
144-
self.data = <float64_t*> self.ao.data
148+
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
149+
self.ao.resize(self.data.m)
150+
self.data.data = <float64_t*> self.ao.data
145151

146-
cdef inline void append(self, float64_t x) nogil:
147-
if self.needs_resize():
148-
with gil:
149-
self.resize()
152+
def __dealloc__(self):
153+
PyMem_Free(self.data)
150154

151-
self.data[self.n] = x
152-
self.n += 1
155+
def __len__(self):
156+
return self.data.n
153157

158+
def to_array(self):
159+
self.ao.resize(self.data.n)
160+
self.data.m = self.data.n
161+
return self.ao
162+
163+
cdef inline void append(self, float64_t x):
164+
165+
if Float64VectorData_needs_resize(self.data):
166+
self.resize()
167+
168+
Float64VectorData_append(self.data, x)
154169

155170
cdef class HashTable:
156171
pass
@@ -370,25 +385,12 @@ cdef class Int64HashTable(HashTable):
370385
int ret = 0
371386
int64_t val
372387
khiter_t k
388+
Int64VectorData *ud
373389

374390
labels = np.empty(n, dtype=np.int64)
391+
ud = uniques.data
375392

376-
if n > _USE_GIL:
377-
with nogil:
378-
for i in range(n):
379-
val = values[i]
380-
k = kh_get_int64(self.table, val)
381-
if k != self.table.n_buckets:
382-
idx = self.table.vals[k]
383-
labels[i] = idx
384-
else:
385-
k = kh_put_int64(self.table, val, &ret)
386-
self.table.vals[k] = count
387-
uniques.append_nogil(val)
388-
labels[i] = count
389-
count += 1
390-
391-
else:
393+
with nogil:
392394
for i in range(n):
393395
val = values[i]
394396
k = kh_get_int64(self.table, val)
@@ -398,7 +400,11 @@ cdef class Int64HashTable(HashTable):
398400
else:
399401
k = kh_put_int64(self.table, val, &ret)
400402
self.table.vals[k] = count
401-
uniques.append(val)
403+
404+
if Int64VectorData_needs_resize(ud):
405+
with gil:
406+
uniques.resize()
407+
Int64VectorData_append(ud, val)
402408
labels[i] = count
403409
count += 1
404410

@@ -414,8 +420,10 @@ cdef class Int64HashTable(HashTable):
414420
int64_t val
415421
khiter_t k
416422
Int64Vector uniques = Int64Vector()
423+
Int64VectorData *ud
417424

418425
labels = np.empty(n, dtype=np.int64)
426+
ud = uniques.data
419427

420428
with nogil:
421429
for i in range(n):
@@ -433,7 +441,11 @@ cdef class Int64HashTable(HashTable):
433441
else:
434442
k = kh_put_int64(self.table, val, &ret)
435443
self.table.vals[k] = count
436-
uniques.append_nogil(val)
444+
445+
if Int64VectorData_needs_resize(ud):
446+
with gil:
447+
uniques.resize()
448+
Int64VectorData_append(ud, val)
437449
labels[i] = count
438450
count += 1
439451

@@ -450,14 +462,21 @@ cdef class Int64HashTable(HashTable):
450462
int64_t val
451463
khiter_t k
452464
Int64Vector uniques = Int64Vector()
465+
Int64VectorData *ud
466+
467+
ud = uniques.data
453468

454469
with nogil:
455470
for i in range(n):
456471
val = values[i]
457472
k = kh_get_int64(self.table, val)
458473
if k == self.table.n_buckets:
459474
kh_put_int64(self.table, val, &ret)
460-
uniques.append_nogil(val)
475+
476+
if Int64VectorData_needs_resize(ud):
477+
with gil:
478+
uniques.resize()
479+
Int64VectorData_append(ud, val)
461480

462481
result = uniques.to_array()
463482

@@ -518,8 +537,10 @@ cdef class Float64HashTable(HashTable):
518537
int ret = 0
519538
float64_t val
520539
khiter_t k
540+
Float64VectorData *ud
521541

522542
labels = np.empty(n, dtype=np.int64)
543+
ud = uniques.data
523544

524545
with nogil:
525546
for i in range(n):
@@ -536,7 +557,11 @@ cdef class Float64HashTable(HashTable):
536557
else:
537558
k = kh_put_float64(self.table, val, &ret)
538559
self.table.vals[k] = count
539-
uniques.append(val)
560+
561+
if Float64VectorData_needs_resize(ud):
562+
with gil:
563+
uniques.resize()
564+
Float64VectorData_append(ud, val)
540565
labels[i] = count
541566
count += 1
542567

@@ -581,8 +606,11 @@ cdef class Float64HashTable(HashTable):
581606
int ret = 0
582607
float64_t val
583608
khiter_t k
584-
Float64Vector uniques = Float64Vector()
585609
bint seen_na = 0
610+
Float64Vector uniques = Float64Vector()
611+
Float64VectorData *ud
612+
613+
ud = uniques.data
586614

587615
with nogil:
588616
for i in range(n):
@@ -592,10 +620,19 @@ cdef class Float64HashTable(HashTable):
592620
k = kh_get_float64(self.table, val)
593621
if k == self.table.n_buckets:
594622
kh_put_float64(self.table, val, &ret)
595-
uniques.append(val)
623+
624+
if Float64VectorData_needs_resize(ud):
625+
with gil:
626+
uniques.resize()
627+
Float64VectorData_append(ud, val)
628+
596629
elif not seen_na:
597630
seen_na = 1
598-
uniques.append(NAN)
631+
632+
if Float64VectorData_needs_resize(ud):
633+
with gil:
634+
uniques.resize()
635+
Float64VectorData_append(ud, NAN)
599636

600637
return uniques.to_array()
601638

pandas/index.pyx

+3 -3
Original file line number | Diff line number | Diff line change
@@ -233,14 +233,13 @@ cdef class IndexEngine:
233233
cdef inline _do_monotonic_check(self):
234234
try:
235235
values = self._get_index_values()
236-
self.monotonic_inc, self.monotonic_dec, self.unique = \
236+
self.monotonic_inc, self.monotonic_dec = \
237237
self._call_monotonic(values)
238238
except TypeError:
239239
self.monotonic_inc = 0
240240
self.monotonic_dec = 0
241241

242242
self.monotonic_check = 1
243-
self.unique_check = 1
244243

245244
cdef _get_index_values(self):
246245
return self.vgetter()
@@ -269,7 +268,8 @@ cdef class IndexEngine:
269268

270269
if len(self.mapping) == len(values):
271270
self.unique = 1
272-
self.unique_check = 1
271+
self.unique_check = 1
272+
273273
self.initialized = 1
274274

275275
def clear_mapping(self):

0 commit comments

Comments
 (0)