From 29075068d1869a2949bce8db789f5651a5cf83e6 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 3 May 2017 18:42:50 -0500 Subject: [PATCH 1/3] COMPAT/PERF remove arrays from Vector classes --- pandas/_libs/hashtable.pxd | 3 +- pandas/_libs/hashtable.pyx | 4 +- pandas/_libs/hashtable_class_helper.pxi.in | 81 +++++++++++++--------- 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..5b4e6e99e5e93 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -52,8 +52,9 @@ cdef struct Int64VectorData: cdef class Int64Vector: cdef Int64VectorData *data cdef ndarray ao + cdef bint external_view_exists - cdef resize(self) + cdef void resize(self) nogil cpdef to_array(self) cdef inline void append(self, int64_t x) cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c8aedcef77502..d7e7c017f04e9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -5,12 +5,12 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check from khash cimport * from numpy cimport * -from libc.stdlib cimport malloc, free +from libc.stdlib cimport malloc, free, realloc from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free, PyString_Check, PyBytes_Check, PyUnicode_Check) -from util cimport _checknan +from util cimport _checknan, set_array_owndata cimport util import numpy as np diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..cfacfc3be6e49 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -58,10 +58,10 @@ cdef inline bint needs_resize(vector_data *data) nogil: {{py: -# name, dtype, arg, idtype -dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), - ('UInt64', 'uint64', 'uint64_t', 'np.uint64'), - ('Int64', 'int64', 'int64_t', 'np.int64')] +# name, dtype, arg, type_id +dtypes = [('Float64', 'float64', 'float64_t', 'cnp.NPY_FLOAT64'), + ('UInt64', 'uint64', 'uint64_t', 'cnp.NPY_UINT64'), + ('Int64', 'int64', 'int64_t', 'cnp.NPY_INT64')] }} @@ -71,6 +71,7 @@ cdef class {{name}}Vector: {{if dtype != 'int64'}} cdef: + bint external_view_exists {{name}}VectorData *data ndarray ao {{endif}} @@ -80,18 +81,21 @@ cdef class {{name}}Vector: sizeof({{name}}VectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype={{idtype}}) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*> malloc(self.data.m * sizeof({{arg}})) - cdef resize(self): + cdef void resize(self) nogil: + # TODO: handle failure to allocate self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = <{{arg}}*> self.ao.data + self.data.data = <{{arg}}*> realloc(self.data.data, self.data.m * sizeof({{arg}})) def __dealloc__(self): if self.data is not NULL: + if not self.external_view_exists and self.data.data: + # buffer was never returned as array, so free + free(self.data.data) PyMem_Free(self.data) self.data = NULL @@ -99,9 +103,19 @@ cdef class {{name}}Vector: return self.data.n cpdef to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao + cdef: + ndarray ao + cnp.npy_intp shape[1] + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") + + self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}})) + self.external_view_exists = True + shape[0] = self.data.n + ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, self.data.data) + # ownership transfer so numpy eventually frees + set_array_owndata(ao) + return ao cdef inline void append(self, {{arg}} x): @@ -120,32 +134,28 @@ cdef class StringVector: cdef: StringVectorData *data + bint external_view_exists def __cinit__(self): self.data = PyMem_Malloc( sizeof(StringVectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) - cdef resize(self): - cdef: - char **orig_data - size_t i, m - - m = self.data.m + cdef void resize(self) nogil: self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - # TODO: can resize? - orig_data = self.data.data - self.data.data = malloc(self.data.m * sizeof(char *)) - for i in range(m): - self.data.data[i] = orig_data[i] + self.data.data = realloc(self.data.data, + self.data.m * sizeof(char *)) def __dealloc__(self): if self.data is not NULL: + # unlike numeric vectors, strings were copied + # into python objects so always free if self.data.data is not NULL: free(self.data.data) PyMem_Free(self.data) @@ -159,12 +169,15 @@ cdef class StringVector: ndarray ao size_t n object val + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") ao = np.empty(self.data.n, dtype=np.object) for i in range(self.data.n): val = self.data.data[i] ao[i] = val self.data.m = self.data.n + self.external_view_exists = True return ao cdef inline void append(self, char * x): @@ -181,8 +194,10 @@ cdef class ObjectVector: PyObject **data size_t n, m ndarray ao + bint external_view_exists def __cinit__(self): + self.external_view_exists = False self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) @@ -194,7 +209,7 @@ cdef class ObjectVector: cdef inline append(self, object o): if self.n == self.m: self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) + self.ao.resize(self.m, refcheck=False) self.data = self.ao.data Py_INCREF(o) @@ -202,8 +217,11 @@ cdef class ObjectVector: self.n += 1 def to_array(self): + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") self.ao.resize(self.n) self.m = self.n + self.external_view_exists = True return self.ao @@ -361,8 +379,7 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = count if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -404,8 +421,7 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = count if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -437,22 +453,19 @@ cdef class {{name}}HashTable(HashTable): if k == self.table.n_buckets: kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) elif not seen_na: seen_na = 1 if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, NAN) {{else}} k = kh_get_{{dtype}}(self.table, val) if k == self.table.n_buckets: kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): - with gil: - uniques.resize() + uniques.resize() append_data_{{dtype}}(ud, val) {{endif}} From 2cb49fa4143ff34ecbac3b517a9326f45bda03f2 Mon Sep 17 00:00:00 2001 From: mattip Date: Thu, 4 May 2017 06:19:58 +0300 Subject: [PATCH 2/3] COMPAT/PERF add checks, adapt test, fix data.m after realloc --- pandas/_libs/hashtable_class_helper.pxi.in | 24 +++++++++++++-- pandas/tests/test_algos.py | 35 ++++++++++++++-------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index cfacfc3be6e49..f035ebf93d997 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -85,11 +85,15 @@ cdef class {{name}}Vector: self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = <{{arg}}*> malloc(self.data.m * sizeof({{arg}})) + if not self.data.data: + raise MemoryError() cdef void resize(self) nogil: - # TODO: handle failure to allocate self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) self.data.data = <{{arg}}*> realloc(self.data.data, self.data.m * sizeof({{arg}})) + if not self.data.data: + with gil: + raise MemoryError() def __dealloc__(self): if self.data is not NULL: @@ -110,6 +114,7 @@ cdef class {{name}}Vector: raise ValueError("Vector.to_array() can only be called once") self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}})) + self.data.m = self.data.n self.external_view_exists = True shape[0] = self.data.n ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, self.data.data) @@ -120,6 +125,8 @@ cdef class {{name}}Vector: cdef inline void append(self, {{arg}} x): if needs_resize(self.data): + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.resize() append_data_{{dtype}}(self.data, x) @@ -145,12 +152,17 @@ cdef class StringVector: self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() cdef void resize(self) nogil: self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) self.data.data = realloc(self.data.data, self.data.m * sizeof(char *)) + if not self.data.data: + with gil: + raise MemoryError() def __dealloc__(self): if self.data is not NULL: @@ -183,6 +195,8 @@ cdef class StringVector: cdef inline void append(self, char * x): if needs_resize(self.data): + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.resize() append_data_string(self.data, x) @@ -208,6 +222,8 @@ cdef class ObjectVector: cdef inline append(self, object o): if self.n == self.m: + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) self.ao.resize(self.m, refcheck=False) self.data = self.ao.data @@ -219,7 +235,7 @@ cdef class ObjectVector: def to_array(self): if self.external_view_exists: raise ValueError("Vector.to_array() can only be called once") - self.ao.resize(self.n) + self.ao.resize(self.n, refcheck=False) self.m = self.n self.external_view_exists = True return self.ao @@ -380,6 +396,7 @@ cdef class {{name}}HashTable(HashTable): if needs_resize(ud): uniques.resize() + ud = uniques.data append_data_{{dtype}}(ud, val) labels[i] = count count += 1 @@ -454,11 +471,13 @@ cdef class {{name}}HashTable(HashTable): kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): uniques.resize() + ud = uniques.data append_data_{{dtype}}(ud, val) elif not seen_na: seen_na = 1 if needs_resize(ud): uniques.resize() + ud = uniques.data append_data_{{dtype}}(ud, NAN) {{else}} k = kh_get_{{dtype}}(self.table, val) @@ -466,6 +485,7 @@ cdef class {{name}}HashTable(HashTable): kh_put_{{dtype}}(self.table, val, &ret) if needs_resize(ud): uniques.resize() + ud = uniques.data append_data_{{dtype}}(ud, val) {{endif}} diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 86d9ab3643cc9..e3d0654c604e6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1070,25 +1070,36 @@ def test_vector_resize(self): # Test for memory errors after internal vector # reallocations (pull request #7157) - def _test_vector_resize(htable, uniques, dtype, nvals): + def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): vals = np.array(np.random.randn(1000), dtype=dtype) # get_labels appends to the vector htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array resizes the vector - uniques.to_array() - htable.get_labels(vals, uniques, 0, -1) + # to_array may resize the vector + tmp = uniques.to_array() + oldshape = tmp.shape + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with pytest.raises(ValueError) as excinfo: + htable.get_labels(vals, uniques, 0, -1) + assert excinfo.value.message == 'external reference but Vector.resize() needed' + with pytest.raises(ValueError) as excinfo: + # cannot resize, tmp is holding a reference + tmp2 = uniques.to_array() + assert excinfo.value.message == 'Vector.to_array() can only be called once' + assert tmp.shape == oldshape test_cases = [ - (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), - (hashtable.StringHashTable, hashtable.ObjectVector, 'object'), - (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), - (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), - (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] + (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object', False), + (hashtable.StringHashTable, hashtable.ObjectVector, 'object', True), + (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64', True), + (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64', True), + (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64', True)] - for (tbl, vect, dtype) in test_cases: + for (tbl, vect, dtype, safely_resizes) in test_cases: # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0) - _test_vector_resize(tbl(), vect(), dtype, 10) + _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) + _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes) def test_quantile(): From c8eff395778878b339eef42f619ad0499be697a7 Mon Sep 17 00:00:00 2001 From: mattip Date: Fri, 5 May 2017 11:39:37 +0300 Subject: [PATCH 3/3] COMPAT/PERF carefully allow multiple calls of to_array() --- pandas/_libs/hashtable_class_helper.pxi.in | 26 +++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f035ebf93d997..ec740c10b239d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -110,11 +110,11 @@ cdef class {{name}}Vector: cdef: ndarray ao cnp.npy_intp shape[1] - if self.external_view_exists: - raise ValueError("Vector.to_array() can only be called once") - - self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}})) - self.data.m = self.data.n + if self.data.m != self.data.n: + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") + self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}})) + self.data.m = self.data.n self.external_view_exists = True shape[0] = self.data.n ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, self.data.data) @@ -181,24 +181,17 @@ cdef class StringVector: ndarray ao size_t n object val - if self.external_view_exists: - raise ValueError("Vector.to_array() can only be called once") - ao = np.empty(self.data.n, dtype=np.object) for i in range(self.data.n): val = self.data.data[i] ao[i] = val - self.data.m = self.data.n self.external_view_exists = True return ao cdef inline void append(self, char * x): if needs_resize(self.data): - if self.external_view_exists: - raise ValueError("external reference but Vector.resize() needed") self.resize() - append_data_string(self.data, x) @@ -233,10 +226,11 @@ cdef class ObjectVector: self.n += 1 def to_array(self): - if self.external_view_exists: - raise ValueError("Vector.to_array() can only be called once") - self.ao.resize(self.n, refcheck=False) - self.m = self.n + if self.m != self.n: + if self.external_view_exists: + raise ValueError("Vector.to_array() can only be called once") + self.ao.resize(self.n, refcheck=False) + self.m = self.n self.external_view_exists = True return self.ao