Skip to content

Commit 2907506

Browse files
committed
COMPAT/PERF remove arrays from Vector classes
1 parent 39cc1d0 commit 2907506

File tree

3 files changed

+51
-37
lines changed

3 files changed

+51
-37
lines changed

pandas/_libs/hashtable.pxd

+2-1
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,9 @@ cdef struct Int64VectorData:
5252
cdef class Int64Vector:
5353
cdef Int64VectorData *data
5454
cdef ndarray ao
55+
cdef bint external_view_exists
5556

56-
cdef resize(self)
57+
cdef void resize(self) nogil
5758
cpdef to_array(self)
5859
cdef inline void append(self, int64_t x)
5960
cdef extend(self, int64_t[:] x)

pandas/_libs/hashtable.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
55
from khash cimport *
66
from numpy cimport *
77

8-
from libc.stdlib cimport malloc, free
8+
from libc.stdlib cimport malloc, free, realloc
99
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
1010
PyString_Check, PyBytes_Check,
1111
PyUnicode_Check)
1212

13-
from util cimport _checknan
13+
from util cimport _checknan, set_array_owndata
1414
cimport util
1515

1616
import numpy as np

pandas/_libs/hashtable_class_helper.pxi.in

+47-34
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ cdef inline bint needs_resize(vector_data *data) nogil:
5858

5959
{{py:
6060

61-
# name, dtype, arg, idtype
62-
dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
63-
('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
64-
('Int64', 'int64', 'int64_t', 'np.int64')]
61+
# name, dtype, arg, idtype (numpy type number, e.g. cnp.NPY_INT64)
62+
dtypes = [('Float64', 'float64', 'float64_t', 'cnp.NPY_FLOAT64'),
63+
('UInt64', 'uint64', 'uint64_t', 'cnp.NPY_UINT64'),
64+
('Int64', 'int64', 'int64_t', 'cnp.NPY_INT64')]
6565

6666
}}
6767

@@ -71,6 +71,7 @@ cdef class {{name}}Vector:
7171

7272
{{if dtype != 'int64'}}
7373
cdef:
74+
bint external_view_exists
7475
{{name}}VectorData *data
7576
ndarray ao
7677
{{endif}}
@@ -80,28 +81,41 @@ cdef class {{name}}Vector:
8081
sizeof({{name}}VectorData))
8182
if not self.data:
8283
raise MemoryError()
84+
self.external_view_exists = False
8385
self.data.n = 0
8486
self.data.m = _INIT_VEC_CAP
85-
self.ao = np.empty(self.data.m, dtype={{idtype}})
86-
self.data.data = <{{arg}}*> self.ao.data
87+
self.data.data = <{{arg}}*> malloc(self.data.m * sizeof({{arg}}))
8788

88-
cdef resize(self):
89+
cdef void resize(self) nogil:
90+
# TODO: handle allocation failure (realloc may return NULL)
8991
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
90-
self.ao.resize(self.data.m)
91-
self.data.data = <{{arg}}*> self.ao.data
92+
self.data.data = <{{arg}}*> realloc(self.data.data, self.data.m * sizeof({{arg}}))
9293

9394
def __dealloc__(self):
9495
if self.data is not NULL:
96+
if not self.external_view_exists and self.data.data:
97+
# buffer was never handed out as an array by to_array(), so free it here
98+
free(self.data.data)
9599
PyMem_Free(self.data)
96100
self.data = NULL
97101

98102
def __len__(self):
99103
return self.data.n
100104

101105
cpdef to_array(self):
102-
self.ao.resize(self.data.n)
103-
self.data.m = self.data.n
104-
return self.ao
106+
cdef:
107+
ndarray ao
108+
cnp.npy_intp shape[1]
109+
if self.external_view_exists:
110+
raise ValueError("Vector.to_array() can only be called once")
111+
112+
self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}}))
113+
self.external_view_exists = True
114+
shape[0] = self.data.n
115+
ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, <void*>self.data.data)
116+
# transfer ownership of the buffer to the array so numpy eventually frees it
117+
set_array_owndata(ao)
118+
return ao
105119

106120
cdef inline void append(self, {{arg}} x):
107121

@@ -120,32 +134,28 @@ cdef class StringVector:
120134

121135
cdef:
122136
StringVectorData *data
137+
bint external_view_exists
123138

124139
def __cinit__(self):
125140
self.data = <StringVectorData *>PyMem_Malloc(
126141
sizeof(StringVectorData))
127142
if not self.data:
128143
raise MemoryError()
144+
self.external_view_exists = False
129145
self.data.n = 0
130146
self.data.m = _INIT_VEC_CAP
131147
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
132148

133-
cdef resize(self):
134-
cdef:
135-
char **orig_data
136-
size_t i, m
137-
138-
m = self.data.m
149+
cdef void resize(self) nogil:
139150
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
140151

141-
# TODO: can resize?
142-
orig_data = self.data.data
143-
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
144-
for i in range(m):
145-
self.data.data[i] = orig_data[i]
152+
self.data.data = <char **> realloc(self.data.data,
153+
self.data.m * sizeof(char *))
146154

147155
def __dealloc__(self):
148156
if self.data is not NULL:
157+
# unlike the numeric vectors, to_array() copies the strings
# into Python objects, so always free the buffer here
# into python objects so always free
149159
if self.data.data is not NULL:
150160
free(self.data.data)
151161
PyMem_Free(self.data)
@@ -159,12 +169,15 @@ cdef class StringVector:
159169
ndarray ao
160170
size_t n
161171
object val
172+
if self.external_view_exists:
173+
raise ValueError("Vector.to_array() can only be called once")
162174

163175
ao = np.empty(self.data.n, dtype=np.object)
164176
for i in range(self.data.n):
165177
val = self.data.data[i]
166178
ao[i] = val
167179
self.data.m = self.data.n
180+
self.external_view_exists = True
168181
return ao
169182

170183
cdef inline void append(self, char * x):
@@ -181,8 +194,10 @@ cdef class ObjectVector:
181194
PyObject **data
182195
size_t n, m
183196
ndarray ao
197+
bint external_view_exists
184198

185199
def __cinit__(self):
200+
self.external_view_exists = False
186201
self.n = 0
187202
self.m = _INIT_VEC_CAP
188203
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
@@ -194,16 +209,19 @@ cdef class ObjectVector:
194209
cdef inline append(self, object o):
195210
if self.n == self.m:
196211
self.m = max(self.m * 2, _INIT_VEC_CAP)
197-
self.ao.resize(self.m)
212+
self.ao.resize(self.m, refcheck=False)
198213
self.data = <PyObject**> self.ao.data
199214

200215
Py_INCREF(o)
201216
self.data[self.n] = <PyObject*> o
202217
self.n += 1
203218

204219
def to_array(self):
220+
if self.external_view_exists:
221+
raise ValueError("Vector.to_array() can only be called once")
205222
self.ao.resize(self.n)
206223
self.m = self.n
224+
self.external_view_exists = True
207225
return self.ao
208226

209227

@@ -361,8 +379,7 @@ cdef class {{name}}HashTable(HashTable):
361379
self.table.vals[k] = count
362380

363381
if needs_resize(ud):
364-
with gil:
365-
uniques.resize()
382+
uniques.resize()
366383
append_data_{{dtype}}(ud, val)
367384
labels[i] = count
368385
count += 1
@@ -404,8 +421,7 @@ cdef class {{name}}HashTable(HashTable):
404421
self.table.vals[k] = count
405422

406423
if needs_resize(ud):
407-
with gil:
408-
uniques.resize()
424+
uniques.resize()
409425
append_data_{{dtype}}(ud, val)
410426
labels[i] = count
411427
count += 1
@@ -437,22 +453,19 @@ cdef class {{name}}HashTable(HashTable):
437453
if k == self.table.n_buckets:
438454
kh_put_{{dtype}}(self.table, val, &ret)
439455
if needs_resize(ud):
440-
with gil:
441-
uniques.resize()
456+
uniques.resize()
442457
append_data_{{dtype}}(ud, val)
443458
elif not seen_na:
444459
seen_na = 1
445460
if needs_resize(ud):
446-
with gil:
447-
uniques.resize()
461+
uniques.resize()
448462
append_data_{{dtype}}(ud, NAN)
449463
{{else}}
450464
k = kh_get_{{dtype}}(self.table, val)
451465
if k == self.table.n_buckets:
452466
kh_put_{{dtype}}(self.table, val, &ret)
453467
if needs_resize(ud):
454-
with gil:
455-
uniques.resize()
468+
uniques.resize()
456469
append_data_{{dtype}}(ud, val)
457470
{{endif}}
458471

0 commit comments

Comments
 (0)