Skip to content

COMPAT/TEST test, fix for unsafe Vector.resize(), which allows refche… #16258

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 11, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ cdef struct Int64VectorData:
cdef class Int64Vector:
cdef Int64VectorData *data
cdef ndarray ao
cdef bint external_view_exists
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you only doing this for Int64 for some reason?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the other Vector classes are defined in pandas/_libs/hashtable_class_helper.pxi.in, for some reason the original code defines this class here specificially. So I added the cdef bint external_view_exists where the other classes are defined


cdef resize(self)
cpdef to_array(self)
Expand Down
15 changes: 15 additions & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ cdef class Factorizer:
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
if self.uniques.external_view_exists:
uniques = ObjectVector()
data = self.uniques.to_array()
for v in data:
uniques.append(v)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for consistency maybe we should add .extend to String/Object Vectors. can you add?

self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, check_null)
mask = (labels == na_sentinel)
Expand Down Expand Up @@ -99,6 +105,15 @@ cdef class Int64Factorizer:

def factorize(self, int64_t[:] values, sort=False,
na_sentinel=-1, check_null=True):
"""
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
if self.uniques.external_view_exists:
uniques = Int64Vector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
check_null)
Expand Down
37 changes: 31 additions & 6 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ cdef class {{name}}Vector:

{{if dtype != 'int64'}}
cdef:
bint external_view_exists
{{name}}VectorData *data
ndarray ao
{{endif}}
Expand All @@ -80,14 +81,15 @@ cdef class {{name}}Vector:
sizeof({{name}}VectorData))
if not self.data:
raise MemoryError()
self.external_view_exists = False
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.ao = np.empty(self.data.m, dtype={{idtype}})
self.data.data = <{{arg}}*> self.ao.data

cdef resize(self):
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m)
self.ao.resize(self.data.m, refcheck=False)
self.data.data = <{{arg}}*> self.ao.data

def __dealloc__(self):
Expand All @@ -99,13 +101,19 @@ cdef class {{name}}Vector:
return self.data.n

cpdef to_array(self):
self.ao.resize(self.data.n)
if self.data.m != self.data.n:
if self.external_view_exists:
raise ValueError("should have raised on append(), m=%d n=%d" % (self.data.m, self.data.n))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use .format() syntax and wrap the line

self.ao.resize(self.data.n, refcheck=False)
self.data.m = self.data.n
self.external_view_exists = True
return self.ao

cdef inline void append(self, {{arg}} x):

if needs_resize(self.data):
if self.external_view_exists:
raise ValueError("external reference but Vector.resize() needed")
self.resize()

append_data_{{dtype}}(self.data, x)
Expand All @@ -120,15 +128,19 @@ cdef class StringVector:

cdef:
StringVectorData *data
bint external_view_exists

def __cinit__(self):
self.data = <StringVectorData *>PyMem_Malloc(
sizeof(StringVectorData))
if not self.data:
raise MemoryError()
self.external_view_exists = False
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()

cdef resize(self):
cdef:
Expand All @@ -138,9 +150,10 @@ cdef class StringVector:
m = self.data.m
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

# TODO: can resize?
orig_data = self.data.data
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
for i in range(m):
self.data.data[i] = orig_data[i]

Expand All @@ -164,6 +177,7 @@ cdef class StringVector:
for i in range(self.data.n):
val = self.data.data[i]
ao[i] = val
self.external_view_exists = True
self.data.m = self.data.n
return ao

Expand All @@ -181,8 +195,10 @@ cdef class ObjectVector:
PyObject **data
size_t n, m
ndarray ao
bint external_view_exists

def __cinit__(self):
self.external_view_exists = False
self.n = 0
self.m = _INIT_VEC_CAP
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
Expand All @@ -193,17 +209,23 @@ cdef class ObjectVector:

cdef inline append(self, object o):
if self.n == self.m:
if self.external_view_exists:
raise ValueError("external reference but Vector.resize() needed")
self.m = max(self.m * 2, _INIT_VEC_CAP)
self.ao.resize(self.m)
self.ao.resize(self.m, refcheck=False)
self.data = <PyObject**> self.ao.data

Py_INCREF(o)
self.data[self.n] = <PyObject*> o
self.n += 1

def to_array(self):
self.ao.resize(self.n)
self.m = self.n
if self.m != self.n:
if self.external_view_exists:
raise ValueError("should have raised on append()")
self.ao.resize(self.n, refcheck=False)
self.m = self.n
self.external_view_exists = True
return self.ao


Expand Down Expand Up @@ -362,6 +384,9 @@ cdef class {{name}}HashTable(HashTable):

if needs_resize(ud):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to uniques held, "
"but Vector.resize() needed")
uniques.resize()
append_data_{{dtype}}(ud, val)
labels[i] = count
Expand Down
40 changes: 24 additions & 16 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from pandas import compat
from pandas._libs import (groupby as libgroupby, algos as libalgos,
hashtable)
hashtable as ht)
from pandas._libs.hashtable import unique_label_indices
from pandas.compat import lrange, range
import pandas.core.algorithms as algos
Expand Down Expand Up @@ -259,7 +259,7 @@ def test_factorize_nan(self):
# rizer.factorize should not raise an exception if na_sentinel indexes
# outside of reverse_indexer
key = np.array([1, 2, 1, np.nan], dtype='O')
rizer = hashtable.Factorizer(len(key))
rizer = ht.Factorizer(len(key))
for na_sentinel in (-1, 20):
ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
Expand Down Expand Up @@ -1049,14 +1049,14 @@ class TestHashTable(object):

def test_lookup_nan(self):
xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
m = hashtable.Float64HashTable()
m = ht.Float64HashTable()
m.map_locations(xs)
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
dtype=np.int64))

def test_lookup_overflow(self):
xs = np.array([1, 2, 2**63], dtype=np.uint64)
m = hashtable.UInt64HashTable()
m = ht.UInt64HashTable()
m.map_locations(xs)
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
dtype=np.int64))
Expand All @@ -1070,25 +1070,33 @@ def test_vector_resize(self):
# Test for memory errors after internal vector
# reallocations (pull request #7157)

def _test_vector_resize(htable, uniques, dtype, nvals):
def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
vals = np.array(np.random.randn(1000), dtype=dtype)
# get_labels appends to the vector
htable.get_labels(vals[:nvals], uniques, 0, -1)
# to_array resizes the vector
uniques.to_array()
htable.get_labels(vals, uniques, 0, -1)
# to_array may resize the vector
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you put a 1-line comment here on wheat you are checking

tmp = uniques.to_array()
oldshape = tmp.shape
if safely_resizes:
htable.get_labels(vals, uniques, 0, -1)
else:
with pytest.raises(ValueError) as excinfo:
htable.get_labels(vals, uniques, 0, -1)
assert str(excinfo.value).startswith('external reference')
uniques.to_array() # should not raise
assert tmp.shape == oldshape

test_cases = [
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]
(ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
(ht.StringHashTable, ht.ObjectVector, 'object', True),
(ht.Float64HashTable, ht.Float64Vector, 'float64', False),
(ht.Int64HashTable, ht.Int64Vector, 'int64', False),
(ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]

for (tbl, vect, dtype) in test_cases:
for (tbl, vect, dtype, safely_resizes) in test_cases:
# resizing to empty is a special case
_test_vector_resize(tbl(), vect(), dtype, 0)
_test_vector_resize(tbl(), vect(), dtype, 10)
_test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes)
_test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes)


def test_quantile():
Expand Down