Skip to content

Pypy refcheck #16193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ cdef class Int64Vector:
cdef Int64VectorData *data
cdef ndarray ao

cdef resize(self)
cpdef to_array(self)
cdef inline void append(self, int64_t x)
cdef extend(self, int64_t[:] x)
cdef resize(self, refcheck=*)
cpdef to_array(self, refcheck=*)
cdef inline void append(self, int64_t x, refcheck=*)
cdef extend(self, int64_t[:] x, refcheck=*)
13 changes: 7 additions & 6 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,14 @@ cdef class Factorizer:
array([ 0, 1, 20])
"""
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, check_null)
self.count, na_sentinel,
check_null, refcheck=False)
mask = (labels == na_sentinel)
# sort on
if sort:
if labels.dtype != np.intp:
labels = labels.astype(np.intp)
sorter = self.uniques.to_array().argsort()
sorter = self.uniques.to_array(refcheck=False).argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
labels = reverse_indexer.take(labels, mode='clip')
Expand Down Expand Up @@ -101,14 +102,14 @@ cdef class Int64Factorizer:
na_sentinel=-1, check_null=True):
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
check_null)
check_null, refcheck=False)

# sort on
if sort:
if labels.dtype != np.intp:
labels = labels.astype(np.intp)

sorter = self.uniques.to_array().argsort()
sorter = self.uniques.to_array(refcheck=False).argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))

Expand Down Expand Up @@ -142,12 +143,12 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels):
if ret != 0:
if needs_resize(ud):
with gil:
idx.resize()
idx.resize(refcheck=False)
append_data_int64(ud, i)

kh_destroy_int64(table)

arr = idx.to_array()
arr = idx.to_array(refcheck=False)
arr = arr[labels[arr].argsort()]

return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
82 changes: 43 additions & 39 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,11 @@ cdef class {{name}}Vector:
self.ao = np.empty(self.data.m, dtype={{idtype}})
self.data.data = <{{arg}}*> self.ao.data

cdef resize(self):
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m)
cdef resize(self, refcheck=True):
m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(m, refcheck=refcheck) # could raise, change m later
self.data.data = <{{arg}}*> self.ao.data
self.data.m = m

def __dealloc__(self):
if self.data is not NULL:
Expand All @@ -98,21 +99,21 @@ cdef class {{name}}Vector:
def __len__(self):
return self.data.n

cpdef to_array(self):
self.ao.resize(self.data.n)
cpdef to_array(self, refcheck=True):
self.ao.resize(self.data.n, refcheck=refcheck)
self.data.m = self.data.n
self.data.data = <{{arg}}*> self.ao.data
return self.ao

cdef inline void append(self, {{arg}} x):
cdef inline void append(self, {{arg}} x, refcheck=True):

if needs_resize(self.data):
self.resize()

self.resize(refcheck=refcheck)
append_data_{{dtype}}(self.data, x)

cdef extend(self, {{arg}}[:] x):
cdef extend(self, {{arg}}[:] x, refcheck=True):
for i in range(len(x)):
self.append(x[i])
self.append(x[i], refcheck=refcheck)

{{endfor}}

Expand All @@ -130,11 +131,12 @@ cdef class StringVector:
self.data.m = _INIT_VEC_CAP
self.data.data = <char **> malloc(self.data.m * sizeof(char *))

cdef resize(self):
cdef resize(self, refcheck=True):
cdef:
char **orig_data
size_t i, m

# refcheck ignored, for compatibility only
m = self.data.m
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

Expand All @@ -154,23 +156,24 @@ cdef class StringVector:
def __len__(self):
return self.data.n

def to_array(self):
def to_array(self, refcheck=True):
cdef:
ndarray ao
size_t n
object val

# refcheck ignored, for compatibility only
ao = np.empty(self.data.n, dtype=np.object)
for i in range(self.data.n):
val = self.data.data[i]
ao[i] = val
self.data.m = self.data.n
return ao

cdef inline void append(self, char * x):
cdef inline void append(self, char * x, refcheck=True):

if needs_resize(self.data):
self.resize()
self.resize(refcheck=refcheck)

append_data_string(self.data, x)

Expand All @@ -191,18 +194,18 @@ cdef class ObjectVector:
def __len__(self):
return self.n

cdef inline append(self, object o):
cdef inline append(self, object o, refcheck=True):
if self.n == self.m:
self.m = max(self.m * 2, _INIT_VEC_CAP)
self.ao.resize(self.m)
self.ao.resize(self.m, refcheck=refcheck)
self.data = <PyObject**> self.ao.data

Py_INCREF(o)
self.data[self.n] = <PyObject*> o
self.n += 1

def to_array(self):
self.ao.resize(self.n)
def to_array(self, refcheck=True):
self.ao.resize(self.n, refcheck=refcheck)
self.m = self.n
return self.ao

Expand Down Expand Up @@ -324,13 +327,13 @@ cdef class {{name}}HashTable(HashTable):

def factorize(self, {{dtype}}_t values):
uniques = {{name}}Vector()
labels = self.get_labels(values, uniques, 0, 0)
return uniques.to_array(), labels
labels = self.get_labels(values, uniques, 0, 0, refcheck=False)
return uniques.to_array(refcheck=False), labels

@cython.boundscheck(False)
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
bint check_null=True):
bint check_null=True, bint refcheck=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand Down Expand Up @@ -362,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):

if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize(refcheck=refcheck)
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
Expand Down Expand Up @@ -405,12 +408,12 @@ cdef class {{name}}HashTable(HashTable):

if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize(refcheck=False)
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1

arr_uniques = uniques.to_array()
arr_uniques = uniques.to_array(refcheck=False)

return np.asarray(labels), arr_uniques

Expand Down Expand Up @@ -438,25 +441,25 @@ cdef class {{name}}HashTable(HashTable):
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize(refcheck=False)
append_data_{{dtype}}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize(refcheck=False)
append_data_{{dtype}}(ud, NAN)
{{else}}
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
uniques.resize(refcheck=False)
append_data_{{dtype}}(ud, val)
{{endif}}

return uniques.to_array()
return uniques.to_array(refcheck=False)

{{endfor}}

Expand Down Expand Up @@ -570,13 +573,13 @@ cdef class StringHashTable(HashTable):
# uniques
uniques = ObjectVector()
for i in range(count):
uniques.append(values[uindexer[i]])
return uniques.to_array()
uniques.append(values[uindexer[i]], refcheck=False)
return uniques.to_array(refcheck=False)

def factorize(self, ndarray[object] values):
uniques = ObjectVector()
labels = self.get_labels(values, uniques, 0, 0)
return uniques.to_array(), labels
labels = self.get_labels(values, uniques, 0, 0, refcheck=False)
return uniques.to_array(refcheck=False), labels

@cython.boundscheck(False)
def lookup(self, ndarray[object] values):
Expand Down Expand Up @@ -642,7 +645,7 @@ cdef class StringHashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=1):
bint check_null=1, bint refcheck=1):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand All @@ -654,6 +657,7 @@ cdef class StringHashTable(HashTable):
char **vecs
khiter_t k


# these by-definition *must* be strings
labels = np.zeros(n, dtype=np.int64)
uindexer = np.empty(n, dtype=np.int64)
Expand Down Expand Up @@ -692,7 +696,7 @@ cdef class StringHashTable(HashTable):

# uniques
for i in range(count):
uniques.append(values[uindexer[i]])
uniques.append(values[uindexer[i]], refcheck=refcheck)

return np.asarray(labels)

Expand Down Expand Up @@ -806,16 +810,16 @@ cdef class PyObjectHashTable(HashTable):
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
uniques.append(val, refcheck=False)
elif not seen_na:
seen_na = 1
uniques.append(nan)
uniques.append(nan, refcheck=False)

return uniques.to_array()
return uniques.to_array(refcheck=False)

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
bint check_null=True, bint refcheck=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand All @@ -841,7 +845,7 @@ cdef class PyObjectHashTable(HashTable):
else:
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = count
uniques.append(val)
uniques.append(val, refcheck=refcheck)
labels[i] = count
count += 1

Expand Down
Loading