-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
COMPAT/TEST test, fix for unsafe Vector.resize(), which allows refche… #16258
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,12 @@ cdef class Factorizer: | |
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) | ||
array([ 0, 1, 20]) | ||
""" | ||
if self.uniques.external_view_exists: | ||
uniques = ObjectVector() | ||
data = self.uniques.to_array() | ||
for v in data: | ||
uniques.append(v) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for consistency maybe we should add |
||
self.uniques = uniques | ||
labels = self.table.get_labels(values, self.uniques, | ||
self.count, na_sentinel, check_null) | ||
mask = (labels == na_sentinel) | ||
|
@@ -99,6 +105,15 @@ cdef class Int64Factorizer: | |
|
||
def factorize(self, int64_t[:] values, sort=False, | ||
na_sentinel=-1, check_null=True): | ||
""" | ||
Factorize values with nans replaced by na_sentinel | ||
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) | ||
array([ 0, 1, 20]) | ||
""" | ||
if self.uniques.external_view_exists: | ||
uniques = Int64Vector() | ||
uniques.extend(self.uniques.to_array()) | ||
self.uniques = uniques | ||
labels = self.table.get_labels(values, self.uniques, | ||
self.count, na_sentinel, | ||
check_null) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ cdef class {{name}}Vector: | |
|
||
{{if dtype != 'int64'}} | ||
cdef: | ||
bint external_view_exists | ||
{{name}}VectorData *data | ||
ndarray ao | ||
{{endif}} | ||
|
@@ -80,14 +81,15 @@ cdef class {{name}}Vector: | |
sizeof({{name}}VectorData)) | ||
if not self.data: | ||
raise MemoryError() | ||
self.external_view_exists = False | ||
self.data.n = 0 | ||
self.data.m = _INIT_VEC_CAP | ||
self.ao = np.empty(self.data.m, dtype={{idtype}}) | ||
self.data.data = <{{arg}}*> self.ao.data | ||
|
||
cdef resize(self): | ||
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) | ||
self.ao.resize(self.data.m) | ||
self.ao.resize(self.data.m, refcheck=False) | ||
self.data.data = <{{arg}}*> self.ao.data | ||
|
||
def __dealloc__(self): | ||
|
@@ -99,13 +101,19 @@ cdef class {{name}}Vector: | |
return self.data.n | ||
|
||
cpdef to_array(self): | ||
self.ao.resize(self.data.n) | ||
if self.data.m != self.data.n: | ||
if self.external_view_exists: | ||
raise ValueError("should have raised on append(), m=%d n=%d" % (self.data.m, self.data.n)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you use .format() syntax and wrap the line |
||
self.ao.resize(self.data.n, refcheck=False) | ||
self.data.m = self.data.n | ||
self.external_view_exists = True | ||
return self.ao | ||
|
||
cdef inline void append(self, {{arg}} x): | ||
|
||
if needs_resize(self.data): | ||
if self.external_view_exists: | ||
raise ValueError("external reference but Vector.resize() needed") | ||
self.resize() | ||
|
||
append_data_{{dtype}}(self.data, x) | ||
|
@@ -120,15 +128,19 @@ cdef class StringVector: | |
|
||
cdef: | ||
StringVectorData *data | ||
bint external_view_exists | ||
|
||
def __cinit__(self): | ||
self.data = <StringVectorData *>PyMem_Malloc( | ||
sizeof(StringVectorData)) | ||
if not self.data: | ||
raise MemoryError() | ||
self.external_view_exists = False | ||
self.data.n = 0 | ||
self.data.m = _INIT_VEC_CAP | ||
self.data.data = <char **> malloc(self.data.m * sizeof(char *)) | ||
if not self.data.data: | ||
raise MemoryError() | ||
|
||
cdef resize(self): | ||
cdef: | ||
|
@@ -138,9 +150,10 @@ cdef class StringVector: | |
m = self.data.m | ||
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) | ||
|
||
# TODO: can resize? | ||
orig_data = self.data.data | ||
self.data.data = <char **> malloc(self.data.m * sizeof(char *)) | ||
if not self.data.data: | ||
raise MemoryError() | ||
for i in range(m): | ||
self.data.data[i] = orig_data[i] | ||
|
||
|
@@ -164,6 +177,7 @@ cdef class StringVector: | |
for i in range(self.data.n): | ||
val = self.data.data[i] | ||
ao[i] = val | ||
self.external_view_exists = True | ||
self.data.m = self.data.n | ||
return ao | ||
|
||
|
@@ -181,8 +195,10 @@ cdef class ObjectVector: | |
PyObject **data | ||
size_t n, m | ||
ndarray ao | ||
bint external_view_exists | ||
|
||
def __cinit__(self): | ||
self.external_view_exists = False | ||
self.n = 0 | ||
self.m = _INIT_VEC_CAP | ||
self.ao = np.empty(_INIT_VEC_CAP, dtype=object) | ||
|
@@ -193,17 +209,23 @@ cdef class ObjectVector: | |
|
||
cdef inline append(self, object o): | ||
if self.n == self.m: | ||
if self.external_view_exists: | ||
raise ValueError("external reference but Vector.resize() needed") | ||
self.m = max(self.m * 2, _INIT_VEC_CAP) | ||
self.ao.resize(self.m) | ||
self.ao.resize(self.m, refcheck=False) | ||
self.data = <PyObject**> self.ao.data | ||
|
||
Py_INCREF(o) | ||
self.data[self.n] = <PyObject*> o | ||
self.n += 1 | ||
|
||
def to_array(self): | ||
self.ao.resize(self.n) | ||
self.m = self.n | ||
if self.m != self.n: | ||
if self.external_view_exists: | ||
raise ValueError("should have raised on append()") | ||
self.ao.resize(self.n, refcheck=False) | ||
self.m = self.n | ||
self.external_view_exists = True | ||
return self.ao | ||
|
||
|
||
|
@@ -362,6 +384,9 @@ cdef class {{name}}HashTable(HashTable): | |
|
||
if needs_resize(ud): | ||
with gil: | ||
if uniques.external_view_exists: | ||
raise ValueError("external reference to uniques held, " | ||
"but Vector.resize() needed") | ||
uniques.resize() | ||
append_data_{{dtype}}(ud, val) | ||
labels[i] = count | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ | |
|
||
from pandas import compat | ||
from pandas._libs import (groupby as libgroupby, algos as libalgos, | ||
hashtable) | ||
hashtable as ht) | ||
from pandas._libs.hashtable import unique_label_indices | ||
from pandas.compat import lrange, range | ||
import pandas.core.algorithms as algos | ||
|
@@ -259,7 +259,7 @@ def test_factorize_nan(self): | |
# rizer.factorize should not raise an exception if na_sentinel indexes | ||
# outside of reverse_indexer | ||
key = np.array([1, 2, 1, np.nan], dtype='O') | ||
rizer = hashtable.Factorizer(len(key)) | ||
rizer = ht.Factorizer(len(key)) | ||
for na_sentinel in (-1, 20): | ||
ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) | ||
expected = np.array([0, 1, 0, na_sentinel], dtype='int32') | ||
|
@@ -1049,14 +1049,14 @@ class TestHashTable(object): | |
|
||
def test_lookup_nan(self): | ||
xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) | ||
m = hashtable.Float64HashTable() | ||
m = ht.Float64HashTable() | ||
m.map_locations(xs) | ||
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), | ||
dtype=np.int64)) | ||
|
||
def test_lookup_overflow(self): | ||
xs = np.array([1, 2, 2**63], dtype=np.uint64) | ||
m = hashtable.UInt64HashTable() | ||
m = ht.UInt64HashTable() | ||
m.map_locations(xs) | ||
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), | ||
dtype=np.int64)) | ||
|
@@ -1070,25 +1070,33 @@ def test_vector_resize(self): | |
# Test for memory errors after internal vector | ||
# reallocations (pull request #7157) | ||
|
||
def _test_vector_resize(htable, uniques, dtype, nvals): | ||
def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): | ||
vals = np.array(np.random.randn(1000), dtype=dtype) | ||
# get_labels appends to the vector | ||
htable.get_labels(vals[:nvals], uniques, 0, -1) | ||
# to_array resizes the vector | ||
uniques.to_array() | ||
htable.get_labels(vals, uniques, 0, -1) | ||
# to_array may resize the vector | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you put a 1-line comment here on wheat you are checking |
||
tmp = uniques.to_array() | ||
oldshape = tmp.shape | ||
if safely_resizes: | ||
htable.get_labels(vals, uniques, 0, -1) | ||
else: | ||
with pytest.raises(ValueError) as excinfo: | ||
htable.get_labels(vals, uniques, 0, -1) | ||
assert str(excinfo.value).startswith('external reference') | ||
uniques.to_array() # should not raise | ||
assert tmp.shape == oldshape | ||
|
||
test_cases = [ | ||
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), | ||
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'), | ||
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), | ||
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), | ||
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] | ||
(ht.PyObjectHashTable, ht.ObjectVector, 'object', False), | ||
(ht.StringHashTable, ht.ObjectVector, 'object', True), | ||
(ht.Float64HashTable, ht.Float64Vector, 'float64', False), | ||
(ht.Int64HashTable, ht.Int64Vector, 'int64', False), | ||
(ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)] | ||
|
||
for (tbl, vect, dtype) in test_cases: | ||
for (tbl, vect, dtype, safely_resizes) in test_cases: | ||
# resizing to empty is a special case | ||
_test_vector_resize(tbl(), vect(), dtype, 0) | ||
_test_vector_resize(tbl(), vect(), dtype, 10) | ||
_test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) | ||
_test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes) | ||
|
||
|
||
def test_quantile(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
are you only doing this for Int64 for some reason?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the other Vector classes are defined in pandas/_libs/hashtable_class_helper.pxi.in, for some reason the original code defines this class here specificially. So I added the
cdef bint external_view_exists
where the other classes are defined