From fea11a771eefc609c9b49fd9e9c6ac7065f3de7b Mon Sep 17 00:00:00 2001 From: mattip Date: Fri, 5 May 2017 17:08:28 +0300 Subject: [PATCH 1/3] COMPAT/TEST test, fix for unsafe Vector.resize(), which allows refcheck=False --- pandas/_libs/hashtable.pxd | 1 + pandas/_libs/hashtable.pyx | 15 ++++++++ pandas/_libs/hashtable_class_helper.pxi.in | 37 ++++++++++++++++---- pandas/tests/test_algos.py | 40 +++++++++++++--------- 4 files changed, 71 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..3366751af144d 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -52,6 +52,7 @@ cdef struct Int64VectorData: cdef class Int64Vector: cdef Int64VectorData *data cdef ndarray ao + cdef bint external_view_exists cdef resize(self) cpdef to_array(self) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c8aedcef77502..b56c6d70a7dd8 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -64,6 +64,12 @@ cdef class Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + if self.uniques.external_view_exists: + uniques = ObjectVector() + data = self.uniques.to_array() + for v in data: + uniques.append(v) + self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, check_null) mask = (labels == na_sentinel) @@ -99,6 +105,15 @@ cdef class Int64Factorizer: def factorize(self, int64_t[:] values, sort=False, na_sentinel=-1, check_null=True): + """ + Factorize values with nans replaced by na_sentinel + >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) + array([ 0, 1, 20]) + """ + if self.uniques.external_view_exists: + uniques = Int64Vector() + uniques.extend(self.uniques.to_array()) + self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, check_null) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..b634869fc4ce5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -71,6 +71,7 @@ cdef class {{name}}Vector: {{if dtype != 'int64'}} cdef: + bint external_view_exists {{name}}VectorData *data ndarray ao {{endif}} @@ -80,6 +81,7 @@ cdef class {{name}}Vector: sizeof({{name}}VectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype={{idtype}}) @@ -87,7 +89,7 @@ cdef class {{name}}Vector: cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) + self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{arg}}*> self.ao.data def __dealloc__(self): @@ -99,13 +101,19 @@ cdef class {{name}}Vector: return self.data.n cpdef to_array(self): - self.ao.resize(self.data.n) + if self.data.m != self.data.n: + if self.external_view_exists: + raise ValueError("should have raised on append(), m=%d n=%d" % (self.data.m, self.data.n)) + self.ao.resize(self.data.n, refcheck=False) self.data.m = self.data.n + self.external_view_exists = True return self.ao cdef inline void append(self, {{arg}} x): if needs_resize(self.data): + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.resize() append_data_{{dtype}}(self.data, x) @@ -120,15 +128,19 @@ cdef class StringVector: cdef: StringVectorData *data + bint external_view_exists def 
__cinit__(self): self.data = PyMem_Malloc( sizeof(StringVectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() cdef resize(self): cdef: @@ -138,9 +150,10 @@ cdef class StringVector: m = self.data.m self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - # TODO: can resize? orig_data = self.data.data self.data.data = malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() for i in range(m): self.data.data[i] = orig_data[i] @@ -164,6 +177,7 @@ cdef class StringVector: for i in range(self.data.n): val = self.data.data[i] ao[i] = val + self.external_view_exists = True self.data.m = self.data.n return ao @@ -181,8 +195,10 @@ cdef class ObjectVector: PyObject **data size_t n, m ndarray ao + bint external_view_exists def __cinit__(self): + self.external_view_exists = False self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) @@ -193,8 +209,10 @@ cdef class ObjectVector: cdef inline append(self, object o): if self.n == self.m: + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) + self.ao.resize(self.m, refcheck=False) self.data = self.ao.data Py_INCREF(o) @@ -202,8 +220,12 @@ cdef class ObjectVector: self.n += 1 def to_array(self): - self.ao.resize(self.n) - self.m = self.n + if self.m != self.n: + if self.external_view_exists: + raise ValueError("should have raised on append()") + self.ao.resize(self.n, refcheck=False) + self.m = self.n + self.external_view_exists = True return self.ao @@ -362,6 +384,9 @@ cdef class {{name}}HashTable(HashTable): if needs_resize(ud): with gil: + if uniques.external_view_exists: + raise ValueError("external reference to uniques held, " + "but Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 093730fb2478b..067c56443ec4e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -14,7 +14,7 @@ from pandas import compat from pandas._libs import (groupby as libgroupby, algos as libalgos, - hashtable) + hashtable as ht) from pandas._libs.hashtable import unique_label_indices from pandas.compat import lrange, range import pandas.core.algorithms as algos @@ -259,7 +259,7 @@ def test_factorize_nan(self): # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer key = np.array([1, 2, 1, np.nan], dtype='O') - rizer = hashtable.Factorizer(len(key)) + rizer = ht.Factorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) expected = np.array([0, 1, 0, na_sentinel], dtype='int32') @@ -1049,14 +1049,14 @@ class TestHashTable(object): def test_lookup_nan(self): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) - m = hashtable.Float64HashTable() + m = ht.Float64HashTable() m.map_locations(xs) tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_lookup_overflow(self): xs = np.array([1, 2, 2**63], dtype=np.uint64) - m = hashtable.UInt64HashTable() + m = ht.UInt64HashTable() m.map_locations(xs) tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) @@ -1070,25 +1070,33 @@ def test_vector_resize(self): # Test for memory errors after internal vector # reallocations 
(pull request #7157) - def _test_vector_resize(htable, uniques, dtype, nvals): + def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): vals = np.array(np.random.randn(1000), dtype=dtype) # get_labels appends to the vector htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array resizes the vector - uniques.to_array() - htable.get_labels(vals, uniques, 0, -1) + # to_array may resize the vector + tmp = uniques.to_array() + oldshape = tmp.shape + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with pytest.raises(ValueError) as excinfo: + htable.get_labels(vals, uniques, 0, -1) + assert str(excinfo.value).startswith('external reference') + uniques.to_array() # should not raise + assert tmp.shape == oldshape test_cases = [ - (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), - (hashtable.StringHashTable, hashtable.ObjectVector, 'object'), - (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), - (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), - (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] + (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), + (ht.StringHashTable, ht.ObjectVector, 'object', True), + (ht.Float64HashTable, ht.Float64Vector, 'float64', False), + (ht.Int64HashTable, ht.Int64Vector, 'int64', False), + (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)] - for (tbl, vect, dtype) in test_cases: + for (tbl, vect, dtype, safely_resizes) in test_cases: # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0) - _test_vector_resize(tbl(), vect(), dtype, 10) + _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) + _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes) def test_quantile(): From 085baf9b9c74876e61ee404c8dad8a8d910a55ad Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 10 May 2017 14:54:29 +0300 Subject: [PATCH 2/3] COMPAT/TEST improve error msg, document test as per review --- pandas/_libs/hashtable_class_helper.pxi.in | 5 +++-- pandas/tests/test_algos.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b634869fc4ce5..d1cd7b1f41768 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -103,9 +103,10 @@ cdef class {{name}}Vector: cpdef to_array(self): if self.data.m != self.data.n: if self.external_view_exists: - raise ValueError("should have raised on append(), m=%d n=%d" % (self.data.m, self.data.n)) + # should never happen + raise ValueError("should have raised on append()") self.ao.resize(self.data.n, refcheck=False) - self.data.m = self.data.n + self.data.m = self.data.n self.external_view_exists = True return self.ao diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 067c56443ec4e..351e646cbb0b2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1072,18 +1072,20 @@ def test_vector_resize(self): def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): vals = np.array(np.random.randn(1000), dtype=dtype) - # get_labels appends to the vector + # get_labels may append to uniques htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array may resize the vector + # to_array() set an external_view_exists flag on uniques. 
tmp = uniques.to_array() oldshape = tmp.shape + # subsequent get_labels() calls can no longer append to it + # (for all but StringHashTables + ObjectVector) if safely_resizes: htable.get_labels(vals, uniques, 0, -1) else: with pytest.raises(ValueError) as excinfo: htable.get_labels(vals, uniques, 0, -1) assert str(excinfo.value).startswith('external reference') - uniques.to_array() # should not raise + uniques.to_array() # should not raise here assert tmp.shape == oldshape test_cases = [ From 247cc75cfbeb58396c4bb119bdb5f0d1ff2463aa Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 10 May 2017 16:28:40 +0300 Subject: [PATCH 3/3] COMPAT/TEST unify interfaces as per review --- pandas/_libs/hashtable.pyx | 4 +--- pandas/_libs/hashtable_class_helper.pxi.in | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index b56c6d70a7dd8..101e2c031f26e 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -66,9 +66,7 @@ cdef class Factorizer: """ if self.uniques.external_view_exists: uniques = ObjectVector() - data = self.uniques.to_array() - for v in data: - uniques.append(v) + uniques.extend(self.uniques.to_array()) self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, check_null) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index d1cd7b1f41768..b80a592669eca 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -189,6 +189,9 @@ cdef class StringVector: append_data_string(self.data, x) + cdef extend(self, ndarray[:] x): + for i in range(len(x)): + self.append(x[i]) cdef class ObjectVector: @@ -229,6 +232,9 @@ cdef class ObjectVector: self.external_view_exists = True return self.ao + cdef extend(self, ndarray[:] x): + for i in range(len(x)): + self.append(x[i]) #---------------------------------------------------------------------- # HashTable
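
Background on the hazard the patches above guard against, reproduced with plain numpy (context only, not part of the patches): ndarray.resize() refuses to reallocate while another array references its buffer, and refcheck=False bypasses that check.

    import numpy as np

    a = np.arange(4)
    v = a[:2]                    # v is a view into a's buffer
    try:
        a.resize(8)              # default refcheck=True: refuses while the view exists
    except ValueError as exc:
        print(exc)
    a.resize(8, refcheck=False)  # proceeds anyway; v may now reference freed memory

A minimal pure-Python sketch of the guard pattern the Cython Vector classes gain in these patches; MiniVector below is a hypothetical stand-in used only to illustrate the append()/to_array()/external_view_exists interaction, not pandas code:

    import numpy as np

    class MiniVector:
        """Grow a backing ndarray; refuse to reallocate once to_array() handed it out."""

        def __init__(self):
            self.ao = np.empty(4, dtype=np.int64)  # tiny initial capacity for the demo
            self.n = 0
            self.external_view_exists = False

        def append(self, x):
            if self.n == len(self.ao):
                if self.external_view_exists:
                    # resizing here would change the shape (and possibly relocate the
                    # buffer) of the array the caller received from to_array()
                    raise ValueError("external reference but Vector.resize() needed")
                self.ao.resize(len(self.ao) * 4, refcheck=False)
            self.ao[self.n] = x
            self.n += 1

        def to_array(self):
            if len(self.ao) != self.n:
                self.ao.resize(self.n, refcheck=False)  # trim unused capacity
            self.external_view_exists = True            # caller now shares the buffer
            return self.ao

    vec = MiniVector()
    for i in range(4):
        vec.append(i)
    out = vec.to_array()   # hands out the internal ndarray itself
    try:
        vec.append(99)     # capacity exhausted -> resize needed -> refused
    except ValueError as exc:
        print(exc)
    print(out)             # [0 1 2 3], unchanged under the caller

This mirrors the flow exercised by test_vector_resize(): get_labels() fills the vector, to_array() marks the external reference, and a further get_labels() call that would need a resize raises instead of silently invalidating the array already handed out (the test marks StringHashTable + ObjectVector as the one combination that can still resize safely).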