Skip to content

Commit 8f3dd75

Browse files
committed
COMPAT/PERF add checks, adapt test, fix data.m after realloc
1 parent 2907506 commit 8f3dd75

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

+22-2
Original file line number | Diff line number | Diff line change
@@ -85,11 +85,15 @@ cdef class {{name}}Vector:
8585
self.data.n = 0
8686
self.data.m = _INIT_VEC_CAP
8787
self.data.data = <{{arg}}*> malloc(self.data.m * sizeof({{arg}}))
88+
if not self.data.data:
89+
raise MemoryError()
8890

8991
cdef void resize(self) nogil:
90-
# TODO: handle failure to allocate
9192
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
9293
self.data.data = <{{arg}}*> realloc(self.data.data, self.data.m * sizeof({{arg}}))
94+
if not self.data.data:
95+
with gil:
96+
raise MemoryError()
9397

9498
def __dealloc__(self):
9599
if self.data is not NULL:
@@ -110,6 +114,7 @@ cdef class {{name}}Vector:
110114
raise ValueError("Vector.to_array() can only be called once")
111115

112116
self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}}))
117+
self.data.m = self.data.n
113118
self.external_view_exists = True
114119
shape[0] = self.data.n
115120
ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, <void*>self.data.data)
@@ -120,6 +125,8 @@ cdef class {{name}}Vector:
120125
cdef inline void append(self, {{arg}} x):
121126

122127
if needs_resize(self.data):
128+
if self.external_view_exists:
129+
raise ValueError("external reference but Vector.resize() needed")
123130
self.resize()
124131

125132
append_data_{{dtype}}(self.data, x)
@@ -145,12 +152,17 @@ cdef class StringVector:
145152
self.data.n = 0
146153
self.data.m = _INIT_VEC_CAP
147154
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
155+
if not self.data.data:
156+
raise MemoryError()
148157

149158
cdef void resize(self) nogil:
150159
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
151160

152161
self.data.data = <char **> realloc(self.data.data,
153162
self.data.m * sizeof(char *))
163+
if not self.data.data:
164+
with gil:
165+
raise MemoryError()
154166

155167
def __dealloc__(self):
156168
if self.data is not NULL:
@@ -183,6 +195,8 @@ cdef class StringVector:
183195
cdef inline void append(self, char * x):
184196

185197
if needs_resize(self.data):
198+
if self.external_view_exists:
199+
raise ValueError("external reference but Vector.resize() needed")
186200
self.resize()
187201

188202
append_data_string(self.data, x)
@@ -208,6 +222,8 @@ cdef class ObjectVector:
208222

209223
cdef inline append(self, object o):
210224
if self.n == self.m:
225+
if self.external_view_exists:
226+
raise ValueError("external reference but Vector.resize() needed")
211227
self.m = max(self.m * 2, _INIT_VEC_CAP)
212228
self.ao.resize(self.m, refcheck=False)
213229
self.data = <PyObject**> self.ao.data
@@ -219,7 +235,7 @@ cdef class ObjectVector:
219235
def to_array(self):
220236
if self.external_view_exists:
221237
raise ValueError("Vector.to_array() can only be called once")
222-
self.ao.resize(self.n)
238+
self.ao.resize(self.n, refcheck=False)
223239
self.m = self.n
224240
self.external_view_exists = True
225241
return self.ao
@@ -380,6 +396,7 @@ cdef class {{name}}HashTable(HashTable):
380396

381397
if needs_resize(ud):
382398
uniques.resize()
399+
ud = uniques.data
383400
append_data_{{dtype}}(ud, val)
384401
labels[i] = count
385402
count += 1
@@ -454,18 +471,21 @@ cdef class {{name}}HashTable(HashTable):
454471
kh_put_{{dtype}}(self.table, val, &ret)
455472
if needs_resize(ud):
456473
uniques.resize()
474+
ud = uniques.data
457475
append_data_{{dtype}}(ud, val)
458476
elif not seen_na:
459477
seen_na = 1
460478
if needs_resize(ud):
461479
uniques.resize()
480+
ud = uniques.data
462481
append_data_{{dtype}}(ud, NAN)
463482
{{else}}
464483
k = kh_get_{{dtype}}(self.table, val)
465484
if k == self.table.n_buckets:
466485
kh_put_{{dtype}}(self.table, val, &ret)
467486
if needs_resize(ud):
468487
uniques.resize()
488+
ud = uniques.data
469489
append_data_{{dtype}}(ud, val)
470490
{{endif}}
471491

pandas/tests/test_algos.py

+23-12
Original file line number | Diff line number | Diff line change
@@ -1070,25 +1070,36 @@ def test_vector_resize(self):
10701070
# Test for memory errors after internal vector
10711071
# reallocations (pull request #7157)
10721072

1073-
def _test_vector_resize(htable, uniques, dtype, nvals):
1073+
def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
10741074
vals = np.array(np.random.randn(1000), dtype=dtype)
10751075
# get_labels appends to the vector
10761076
htable.get_labels(vals[:nvals], uniques, 0, -1)
1077-
# to_array resizes the vector
1078-
uniques.to_array()
1079-
htable.get_labels(vals, uniques, 0, -1)
1077+
# to_array may resize the vector
1078+
tmp = uniques.to_array()
1079+
oldshape = tmp.shape
1080+
if safely_resizes:
1081+
htable.get_labels(vals, uniques, 0, -1)
1082+
else:
1083+
with pytest.raises(ValueError) as excinfo:
1084+
htable.get_labels(vals, uniques, 0, -1)
1085+
assert excinfo.value.message == 'external reference but Vector.resize() needed'
1086+
with pytest.raises(ValueError) as excinfo:
1087+
# cannot resize, tmp is holding a reference
1088+
tmp2 = uniques.to_array()
1089+
assert excinfo.value.message == 'Vector.to_array() can only be called once'
1090+
assert tmp.shape == oldshape
10801091

10811092
test_cases = [
1082-
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
1083-
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
1084-
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
1085-
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
1086-
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]
1093+
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object', False),
1094+
(hashtable.StringHashTable, hashtable.ObjectVector, 'object', True),
1095+
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64', True),
1096+
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64', True),
1097+
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64', True)]
10871098

1088-
for (tbl, vect, dtype) in test_cases:
1099+
for (tbl, vect, dtype, safely_resizes) in test_cases:
10891100
# resizing to empty is a special case
1090-
_test_vector_resize(tbl(), vect(), dtype, 0)
1091-
_test_vector_resize(tbl(), vect(), dtype, 10)
1101+
_test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes)
1102+
_test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes)
10921103

10931104

10941105
def test_quantile():

0 commit comments

Comments
 (0)