Skip to content

Vector array #16224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ cdef struct Int64VectorData:
cdef class Int64Vector:
cdef Int64VectorData *data
cdef ndarray ao
cdef bint external_view_exists

cdef resize(self)
cdef void resize(self) nogil
cpdef to_array(self)
cdef inline void append(self, int64_t x)
cdef extend(self, int64_t[:] x)
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check
from khash cimport *
from numpy cimport *

from libc.stdlib cimport malloc, free
from libc.stdlib cimport malloc, free, realloc
from cpython cimport (PyMem_Malloc, PyMem_Realloc, PyMem_Free,
PyString_Check, PyBytes_Check,
PyUnicode_Check)

from util cimport _checknan
from util cimport _checknan, set_array_owndata
cimport util

import numpy as np
Expand Down
105 changes: 66 additions & 39 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ cdef inline bint needs_resize(vector_data *data) nogil:

{{py:

# name, dtype, arg, idtype
dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
('Int64', 'int64', 'int64_t', 'np.int64')]
# name, dtype, arg, type_id
dtypes = [('Float64', 'float64', 'float64_t', 'cnp.NPY_FLOAT64'),
('UInt64', 'uint64', 'uint64_t', 'cnp.NPY_UINT64'),
('Int64', 'int64', 'int64_t', 'cnp.NPY_INT64')]

}}

Expand All @@ -71,6 +71,7 @@ cdef class {{name}}Vector:

{{if dtype != 'int64'}}
cdef:
bint external_view_exists
{{name}}VectorData *data
ndarray ao
{{endif}}
Expand All @@ -80,32 +81,52 @@ cdef class {{name}}Vector:
sizeof({{name}}VectorData))
if not self.data:
raise MemoryError()
self.external_view_exists = False
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.ao = np.empty(self.data.m, dtype={{idtype}})
self.data.data = <{{arg}}*> self.ao.data
self.data.data = <{{arg}}*> malloc(self.data.m * sizeof({{arg}}))
if not self.data.data:
raise MemoryError()

cdef resize(self):
cdef void resize(self) nogil:
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m)
self.data.data = <{{arg}}*> self.ao.data
self.data.data = <{{arg}}*> realloc(self.data.data, self.data.m * sizeof({{arg}}))
if not self.data.data:
with gil:
raise MemoryError()

def __dealloc__(self):
if self.data is not NULL:
if not self.external_view_exists and self.data.data:
# buffer was never returned as array, so free
free(self.data.data)
PyMem_Free(self.data)
self.data = NULL

def __len__(self):
return self.data.n

cpdef to_array(self):
self.ao.resize(self.data.n)
self.data.m = self.data.n
return self.ao
cdef:
ndarray ao
cnp.npy_intp shape[1]
if self.data.m != self.data.n:
if self.external_view_exists:
raise ValueError("Vector.to_array() can only be called once")
self.data.data = <{{arg}}*> realloc(self.data.data, self.data.n * sizeof({{arg}}))
self.data.m = self.data.n
self.external_view_exists = True
shape[0] = self.data.n
ao = cnp.PyArray_SimpleNewFromData(1, shape, {{idtype}}, <void*>self.data.data)
# ownership transfer so numpy eventually frees
set_array_owndata(ao)
return ao

cdef inline void append(self, {{arg}} x):

if needs_resize(self.data):
if self.external_view_exists:
raise ValueError("external reference but Vector.resize() needed")
self.resize()

append_data_{{dtype}}(self.data, x)
Expand All @@ -120,32 +141,33 @@ cdef class StringVector:

cdef:
StringVectorData *data
bint external_view_exists

def __cinit__(self):
self.data = <StringVectorData *>PyMem_Malloc(
sizeof(StringVectorData))
if not self.data:
raise MemoryError()
self.external_view_exists = False
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()

cdef resize(self):
cdef:
char **orig_data
size_t i, m

m = self.data.m
cdef void resize(self) nogil:
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

# TODO: can resize?
orig_data = self.data.data
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
for i in range(m):
self.data.data[i] = orig_data[i]
self.data.data = <char **> realloc(self.data.data,
self.data.m * sizeof(char *))
if not self.data.data:
with gil:
raise MemoryError()

def __dealloc__(self):
if self.data is not NULL:
# unlike numeric vectors, strings were copied
# into python objects so always free
if self.data.data is not NULL:
free(self.data.data)
PyMem_Free(self.data)
Expand All @@ -159,19 +181,17 @@ cdef class StringVector:
ndarray ao
size_t n
object val

ao = np.empty(self.data.n, dtype=np.object)
for i in range(self.data.n):
val = self.data.data[i]
ao[i] = val
self.data.m = self.data.n
self.external_view_exists = True
return ao

cdef inline void append(self, char * x):

if needs_resize(self.data):
self.resize()

append_data_string(self.data, x)


Expand All @@ -181,8 +201,10 @@ cdef class ObjectVector:
PyObject **data
size_t n, m
ndarray ao
bint external_view_exists

def __cinit__(self):
self.external_view_exists = False
self.n = 0
self.m = _INIT_VEC_CAP
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
Expand All @@ -193,17 +215,23 @@ cdef class ObjectVector:

cdef inline append(self, object o):
if self.n == self.m:
if self.external_view_exists:
raise ValueError("external reference but Vector.resize() needed")
self.m = max(self.m * 2, _INIT_VEC_CAP)
self.ao.resize(self.m)
self.ao.resize(self.m, refcheck=False)
self.data = <PyObject**> self.ao.data

Py_INCREF(o)
self.data[self.n] = <PyObject*> o
self.n += 1

def to_array(self):
self.ao.resize(self.n)
self.m = self.n
if self.m != self.n:
if self.external_view_exists:
raise ValueError("Vector.to_array() can only be called once")
self.ao.resize(self.n, refcheck=False)
self.m = self.n
self.external_view_exists = True
return self.ao


Expand Down Expand Up @@ -361,8 +389,8 @@ cdef class {{name}}HashTable(HashTable):
self.table.vals[k] = count

if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize()
ud = uniques.data
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
Expand Down Expand Up @@ -404,8 +432,7 @@ cdef class {{name}}HashTable(HashTable):
self.table.vals[k] = count

if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize()
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
Expand Down Expand Up @@ -437,22 +464,22 @@ cdef class {{name}}HashTable(HashTable):
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize()
ud = uniques.data
append_data_{{dtype}}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize()
ud = uniques.data
append_data_{{dtype}}(ud, NAN)
{{else}}
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize()
ud = uniques.data
append_data_{{dtype}}(ud, val)
{{endif}}

Expand Down
35 changes: 23 additions & 12 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,25 +1070,36 @@ def test_vector_resize(self):
# Test for memory errors after internal vector
# reallocations (pull request #7157)

def _test_vector_resize(htable, uniques, dtype, nvals):
def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
vals = np.array(np.random.randn(1000), dtype=dtype)
# get_labels appends to the vector
htable.get_labels(vals[:nvals], uniques, 0, -1)
# to_array resizes the vector
uniques.to_array()
htable.get_labels(vals, uniques, 0, -1)
# to_array may resize the vector
tmp = uniques.to_array()
oldshape = tmp.shape
if safely_resizes:
htable.get_labels(vals, uniques, 0, -1)
else:
with pytest.raises(ValueError) as excinfo:
htable.get_labels(vals, uniques, 0, -1)
assert excinfo.value.message == 'external reference but Vector.resize() needed'
with pytest.raises(ValueError) as excinfo:
# cannot resize, tmp is holding a reference
tmp2 = uniques.to_array()
assert excinfo.value.message == 'Vector.to_array() can only be called once'
assert tmp.shape == oldshape

test_cases = [
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object', False),
(hashtable.StringHashTable, hashtable.ObjectVector, 'object', True),
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64', True),
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64', True),
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64', True)]

for (tbl, vect, dtype) in test_cases:
for (tbl, vect, dtype, safely_resizes) in test_cases:
# resizing to empty is a special case
_test_vector_resize(tbl(), vect(), dtype, 0)
_test_vector_resize(tbl(), vect(), dtype, 10)
_test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes)
_test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes)


def test_quantile():
Expand Down