From e221dd84baa887c582ad0e45bdbf2777540d3222 Mon Sep 17 00:00:00 2001 From: mattip Date: Thu, 27 Apr 2017 08:13:15 +0300 Subject: [PATCH 1/3] COMPAT: add refcheck kwarg and percolate out to app-level for PyPy --- pandas/_libs/hashtable.pxd | 8 +-- pandas/_libs/hashtable.pyx | 8 +-- pandas/_libs/hashtable_class_helper.pxi.in | 75 ++++++++++++---------- pandas/_libs/intervaltree.pxi.in | 51 ++++++++------- pandas/core/algorithms.py | 4 +- 5 files changed, 78 insertions(+), 68 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..65d8c5eff1b7b 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -53,7 +53,7 @@ cdef class Int64Vector: cdef Int64VectorData *data cdef ndarray ao - cdef resize(self) - cpdef to_array(self) - cdef inline void append(self, int64_t x) - cdef extend(self, int64_t[:] x) + cdef resize(self, refcheck=*) + cpdef to_array(self, refcheck=*) + cdef inline void append(self, int64_t x, refcheck=*) + cdef extend(self, int64_t[:] x, refcheck=*) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c8aedcef77502..4ad658c6b16c4 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -101,14 +101,14 @@ cdef class Int64Factorizer: na_sentinel=-1, check_null=True): labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, - check_null) + check_null, refcheck=False) # sort on if sort: if labels.dtype != np.intp: labels = labels.astype(np.intp) - sorter = self.uniques.to_array().argsort() + sorter = self.uniques.to_array(refcheck=False).argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -142,12 +142,12 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): if ret != 0: if needs_resize(ud): with gil: - idx.resize() + idx.resize(refcheck=False) append_data_int64(ud, i) kh_destroy_int64(table) - arr = idx.to_array() + arr = idx.to_array(refcheck=False) arr = arr[labels[arr].argsort()] return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..f7e18178d80db 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -85,10 +85,11 @@ cdef class {{name}}Vector: self.ao = np.empty(self.data.m, dtype={{idtype}}) self.data.data = <{{arg}}*> self.ao.data - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) + cdef resize(self, refcheck=True): + m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(m, refcheck=refcheck) # could raise, change m later self.data.data = <{{arg}}*> self.ao.data + self.data.m = m def __dealloc__(self): if self.data is not NULL: @@ -98,21 +99,21 @@ cdef class {{name}}Vector: def __len__(self): return self.data.n - cpdef to_array(self): - self.ao.resize(self.data.n) + cpdef to_array(self, refcheck=True): + self.ao.resize(self.data.n, refcheck=refcheck) self.data.m = self.data.n + self.data.data = <{{arg}}*> self.ao.data return self.ao - cdef inline void append(self, {{arg}} x): + cdef inline void append(self, {{arg}} x, refcheck=True): if needs_resize(self.data): - self.resize() - + self.resize(refcheck=refcheck) append_data_{{dtype}}(self.data, x) - cdef extend(self, {{arg}}[:] x): + cdef extend(self, {{arg}}[:] x, refcheck=True): for i in range(len(x)): - self.append(x[i]) + self.append(x[i], refcheck=refcheck) {{endfor}} @@ -130,11 +131,12 @@ cdef class StringVector: self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) - cdef resize(self): + cdef resize(self, refcheck=True): cdef: char **orig_data size_t i, m + # refcheck ignored, for compatibility only m = self.data.m self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) @@ -154,12 +156,13 @@ cdef class StringVector: def __len__(self): return self.data.n - def to_array(self): + def to_array(self, refcheck=True): cdef: ndarray ao size_t n object val + # refcheck ignored, for compatibility only ao = np.empty(self.data.n, dtype=np.object) for i in range(self.data.n): val = self.data.data[i] @@ -167,10 +170,10 @@ cdef class StringVector: self.data.m = self.data.n return ao - cdef inline void append(self, char * x): + cdef inline void append(self, char * x, refcheck=True): if needs_resize(self.data): - self.resize() + self.resize(refcheck=refcheck) append_data_string(self.data, x) @@ -191,18 +194,18 @@ cdef class ObjectVector: def __len__(self): return self.n - cdef inline append(self, object o): + cdef inline append(self, object o, refcheck=True): if self.n == self.m: self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) + self.ao.resize(self.m, refcheck=refcheck) self.data = self.ao.data Py_INCREF(o) self.data[self.n] = o self.n += 1 - def to_array(self): - self.ao.resize(self.n) + def to_array(self, refcheck=True): + self.ao.resize(self.n, refcheck=refcheck) self.m = self.n return self.ao @@ -324,13 +327,13 @@ cdef class {{name}}HashTable(HashTable): def factorize(self, {{dtype}}_t values): uniques = {{name}}Vector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels + labels = self.get_labels(values, uniques, 0, 0, refcheck=False) + return uniques.to_array(refcheck=False), labels @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): + bint check_null=True, bint refcheck=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -362,7 +365,7 @@ cdef class {{name}}HashTable(HashTable): if needs_resize(ud): with gil: - uniques.resize() + uniques.resize(refcheck=refcheck) append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -405,12 +408,12 @@ cdef class {{name}}HashTable(HashTable): if needs_resize(ud): with gil: - uniques.resize() + uniques.resize(refcheck=False) append_data_{{dtype}}(ud, val) labels[i] = count count += 1 - arr_uniques = uniques.to_array() + arr_uniques = uniques.to_array(refcheck=False) return np.asarray(labels), arr_uniques @@ -438,13 +441,13 @@ cdef class {{name}}HashTable(HashTable): kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): with gil: - uniques.resize() + uniques.resize(refcheck=False) append_data_{{dtype}}(ud, val) elif not seen_na: seen_na = 1 if needs_resize(ud): with gil: - uniques.resize() + uniques.resize(refcheck=False) append_data_{{dtype}}(ud, NAN) {{else}} k = kh_get_{{dtype}}(self.table, val) @@ -452,11 +455,11 @@ cdef class {{name}}HashTable(HashTable): kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): with gil: - uniques.resize() + uniques.resize(refcheck=False) append_data_{{dtype}}(ud, val) {{endif}} - return uniques.to_array() + return uniques.to_array(refcheck=False) {{endfor}} @@ -571,12 +574,12 @@ cdef class StringHashTable(HashTable): uniques = ObjectVector() for i in range(count): uniques.append(values[uindexer[i]]) - return uniques.to_array() + return uniques.to_array(refcheck=False) def factorize(self, ndarray[object] values): uniques = ObjectVector() - labels = self.get_labels(values, uniques, 0, 0) - return uniques.to_array(), labels + labels = self.get_labels(values, uniques, 0, 0, refcheck=False) + return uniques.to_array(refcheck=False), labels @cython.boundscheck(False) def lookup(self, ndarray[object] values): @@ -642,7 +645,7 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=1): + bint check_null=1, bint refcheck=1): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -654,6 +657,8 @@ cdef class StringHashTable(HashTable): char **vecs khiter_t k + # refcheck ignored, for compatibility only + # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) @@ -811,11 +816,11 @@ cdef class PyObjectHashTable(HashTable): seen_na = 1 uniques.append(nan) - return uniques.to_array() + return uniques.to_array(refcheck=False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): + bint check_null=True, bint refcheck=True): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -824,6 +829,8 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k + # refcheck ignored, for compatibility only + labels = np.empty(n, dtype=np.int64) for i in range(n): diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 4fa0d6d156fa2..09b71d6ef6fdf 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -12,7 +12,7 @@ cimport cython cimport numpy as cnp cnp.import_array() -from hashtable cimport Int64Vector, Int64VectorData +from hashtable cimport Int64Vector ctypedef fused scalar_t: @@ -91,10 +91,10 @@ cdef class IntervalTree(IntervalMixin): the given scalar key """ result = Int64Vector() - self.root.query(result, key) + self.root.query(result, key, False) if not result.data.n: raise KeyError(key) - return result.to_array() + return result.to_array(refcheck=False) def _get_partial_overlap(self, key_left, key_right, side): """Return all positions corresponding to intervals with the given side @@ -137,14 +137,14 @@ cdef class IntervalTree(IntervalMixin): result = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + self.root.query(result, target[i], False) if result.data.n == old_len: - result.append(-1) + result.append(-1, refcheck=False) elif result.data.n > old_len + 1: raise KeyError( 'indexer does not intersect a unique set of intervals') old_len = result.data.n - return result.to_array() + return result.to_array(refcheck=False) def get_indexer_non_unique(self, scalar_t[:] target): """Return the positions corresponding to intervals that overlap with @@ -159,12 +159,12 @@ cdef class IntervalTree(IntervalMixin): missing = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + self.root.query(result, target[i], False) if result.data.n == old_len: - result.append(-1) - missing.append(i) + result.append(-1, refcheck=False) + missing.append(i, refcheck=False) old_len = result.data.n - return result.to_array(), missing.to_array() + return result.to_array(refcheck=False), missing.to_array(refcheck=False) def __repr__(self): return (' self.pivot: values = self.center_right_values indices = self.center_right_indices for i in range(self.n_center - 1, -1, -1): if not point {{cmp_right}} values[i]: break - result.append(indices[i]) + result.append(indices[i], refcheck=refcheck) if self.right_node.min_left {{cmp_left}} point: - self.right_node.query(result, point) + self.right_node.query(result, point, refcheck) else: - result.extend(self.center_left_indices) + result.extend(self.center_left_indices, refcheck=refcheck) def __repr__(self): if self.is_leaf_node: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8437861bea19e..de7635512ec25 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -551,10 +551,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(values)) uniques = vec_klass() check_nulls = not is_integer_dtype(original) - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, refcheck=False) labels = _ensure_platform_int(labels) - uniques = uniques.to_array() + uniques = uniques.to_array(refcheck=False) if sort and len(uniques) > 0: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, From 4acc4cd356f3c4ebc29aab7b0dc8463782751ca4 Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 2 May 2017 07:32:30 +0300 Subject: [PATCH 2/3] COMPAT: add refcheck kwarg and percolate out to app-level for PyPy --- pandas/_libs/hashtable.pyx | 5 +++-- pandas/_libs/hashtable_class_helper.pxi.in | 13 +++++-------- pandas/core/algorithms.py | 3 ++- pandas/core/reshape/merge.py | 2 +- pandas/tests/test_algos.py | 4 ++-- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 4ad658c6b16c4..c77b470aab11e 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -65,13 +65,14 @@ cdef class Factorizer: array([ 0, 1, 20]) """ labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, check_null) + self.count, na_sentinel, + check_null, refcheck=False) mask = (labels == na_sentinel) # sort on if sort: if labels.dtype != np.intp: labels = labels.astype(np.intp) - sorter = self.uniques.to_array().argsort() + sorter = self.uniques.to_array(refcheck=False).argsort() reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) labels = reverse_indexer.take(labels, mode='clip') diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f7e18178d80db..5c8cb76571267 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -573,7 +573,7 @@ cdef class StringHashTable(HashTable): # uniques uniques = ObjectVector() for i in range(count): - uniques.append(values[uindexer[i]]) + uniques.append(values[uindexer[i]], refcheck=False) return uniques.to_array(refcheck=False) def factorize(self, ndarray[object] values): @@ -657,7 +657,6 @@ cdef class StringHashTable(HashTable): char **vecs khiter_t k - # refcheck ignored, for compatibility only # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) @@ -697,7 +696,7 @@ cdef class StringHashTable(HashTable): # uniques for i in range(count): - uniques.append(values[uindexer[i]]) + uniques.append(values[uindexer[i]], refcheck=refcheck) return np.asarray(labels) @@ -811,10 +810,10 @@ cdef class PyObjectHashTable(HashTable): k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: kh_put_pymap(self.table, val, &ret) - uniques.append(val) + uniques.append(val, refcheck=False) elif not seen_na: seen_na = 1 - uniques.append(nan) + uniques.append(nan, refcheck=False) return uniques.to_array(refcheck=False) @@ -829,8 +828,6 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k - # refcheck ignored, for compatibility only - labels = np.empty(n, dtype=np.int64) for i in range(n): @@ -848,7 +845,7 @@ cdef class PyObjectHashTable(HashTable): else: k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = count - uniques.append(val) + uniques.append(val, refcheck=refcheck) labels[i] = count count += 1 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index de7635512ec25..78747225c1dc4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -551,7 +551,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(values)) uniques = vec_klass() check_nulls = not is_integer_dtype(original) - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, refcheck=False) + labels = table.get_labels(values, uniques, 0, na_sentinel, + check_nulls, refcheck=False) labels = _ensure_platform_int(labels) uniques = uniques.to_array(refcheck=False) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1ca3786ecc174..de020b22143da 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1407,7 +1407,7 @@ def _factorize_keys(lk, rk, sort=True): count = rizer.get_count() if sort: - uniques = rizer.uniques.to_array() + uniques = rizer.uniques.to_array(refcheck=False) llab, rlab = _sort_labels(uniques, llab, rlab) # NA group diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index df267f2374051..54d913879f556 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1069,8 +1069,8 @@ def _test_vector_resize(htable, uniques, dtype, nvals): # get_labels appends to the vector htable.get_labels(vals[:nvals], uniques, 0, -1) # to_array resizes the vector - uniques.to_array() - htable.get_labels(vals, uniques, 0, -1) + uniques.to_array(refcheck=False) + htable.get_labels(vals, uniques, 0, -1, refcheck=False) test_cases = [ (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), From 9aaf521233c47556620675273820f16be071fa95 Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 2 May 2017 18:52:27 +0300 Subject: [PATCH 3/3] TST: show how refcheck semantics leak to user space --- pandas/tests/test_algos.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 54d913879f556..5a12f3e0ed201 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1064,25 +1064,33 @@ def test_vector_resize(self): # Test for memory errors after internal vector # reallocations (pull request #7157) - def _test_vector_resize(htable, uniques, dtype, nvals): + def _test_vector_resize(htable, uniques, dtype, nvals, tmp_changes): vals = np.array(np.random.randn(1000), dtype=dtype) # get_labels appends to the vector htable.get_labels(vals[:nvals], uniques, 0, -1) # to_array resizes the vector - uniques.to_array(refcheck=False) + tmp = uniques.to_array(refcheck=False) + oldshape = tmp.shape htable.get_labels(vals, uniques, 0, -1, refcheck=False) + with pytest.raises(ValueError): + # cannot resize, tmp is holding a reference + tmp2 = uniques.to_array() #refcheck=True is default value + # DANGEROUS - could change tmp + uniques.to_array(refcheck=False) + if tmp_changes: + assert oldshape != tmp.shape test_cases = [ - (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), - (hashtable.StringHashTable, hashtable.ObjectVector, 'object'), - (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), - (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), - (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] + (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object', True), + (hashtable.StringHashTable, hashtable.ObjectVector, 'object', False), + (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64', True), + (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64', True), + (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64', True)] - for (tbl, vect, dtype) in test_cases: + for (tbl, vect, dtype, tmp_changes) in test_cases: # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0) - _test_vector_resize(tbl(), vect(), dtype, 10) + _test_vector_resize(tbl(), vect(), dtype, 0, tmp_changes) + _test_vector_resize(tbl(), vect(), dtype, 10, tmp_changes) def test_quantile():