Skip to content

Commit 085a8eb

Browse files
committed
Pure copy/paste: Group unique/factorize functions next to each other
1 parent 79003e4 commit 085a8eb

File tree

1 file changed

+92
-92
lines changed

1 file changed

+92
-92
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

+92 -92
Original file line number | Diff line number | Diff line change
@@ -355,6 +355,45 @@ cdef class {{name}}HashTable(HashTable):
355 355

356 356
return np.asarray(locs)
357 357

358+
@cython.boundscheck(False)
359+
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
360+
cdef:
361+
Py_ssize_t i, idx, count = 0, n = len(values)
362+
int64_t[:] labels
363+
int ret = 0
364+
{{dtype}}_t val
365+
khiter_t k
366+
{{name}}Vector uniques = {{name}}Vector()
367+
{{name}}VectorData *ud
368+
369+
ud = uniques.data
370+
if return_inverse:
371+
labels = np.empty(n, dtype=np.int64)
372+
373+
with nogil:
374+
for i in range(n):
375+
val = values[i]
376+
k = kh_get_{{dtype}}(self.table, val)
377+
if return_inverse and k != self.table.n_buckets:
378+
# k falls into a previous bucket
379+
idx = self.table.vals[k]
380+
labels[i] = idx
381+
elif k == self.table.n_buckets:
382+
# k hasn't been seen yet
383+
k = kh_put_{{dtype}}(self.table, val, &ret)
384+
if needs_resize(ud):
385+
with gil:
386+
uniques.resize()
387+
append_data_{{dtype}}(ud, val)
388+
if return_inverse:
389+
self.table.vals[k] = count
390+
labels[i] = count
391+
count += 1
392+
393+
if return_inverse:
394+
return uniques.to_array(), np.asarray(labels)
395+
return uniques.to_array()
396+
358 397
def factorize(self, {{dtype}}_t[:] values):
359 398
uniques = {{name}}Vector()
360 399
labels = self.get_labels(values, uniques, 0)
@@ -465,45 +504,6 @@ cdef class {{name}}HashTable(HashTable):
465 504

466 505
return np.asarray(labels), arr_uniques
467 506

468-
@cython.boundscheck(False)
469-
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
470-
cdef:
471-
Py_ssize_t i, idx, count = 0, n = len(values)
472-
int64_t[:] labels
473-
int ret = 0
474-
{{dtype}}_t val
475-
khiter_t k
476-
{{name}}Vector uniques = {{name}}Vector()
477-
{{name}}VectorData *ud
478-
479-
ud = uniques.data
480-
if return_inverse:
481-
labels = np.empty(n, dtype=np.int64)
482-
483-
with nogil:
484-
for i in range(n):
485-
val = values[i]
486-
k = kh_get_{{dtype}}(self.table, val)
487-
if return_inverse and k != self.table.n_buckets:
488-
# k falls into a previous bucket
489-
idx = self.table.vals[k]
490-
labels[i] = idx
491-
elif k == self.table.n_buckets:
492-
# k hasn't been seen yet
493-
k = kh_put_{{dtype}}(self.table, val, &ret)
494-
if needs_resize(ud):
495-
with gil:
496-
uniques.resize()
497-
append_data_{{dtype}}(ud, val)
498-
if return_inverse:
499-
self.table.vals[k] = count
500-
labels[i] = count
501-
count += 1
502-
503-
if return_inverse:
504-
return uniques.to_array(), np.asarray(labels)
505-
return uniques.to_array()
506-
507 507
{{endfor}}
508 508

509 509

@@ -583,59 +583,6 @@ cdef class StringHashTable(HashTable):
583 583
free(vecs)
584 584
return labels
585 585

586-
@cython.boundscheck(False)
587-
def unique(self, ndarray[object] values, bint return_inverse=False):
588-
cdef:
589-
Py_ssize_t i, idx, count = 0, n = len(values)
590-
int64_t[:] labels
591-
int64_t[:] uindexer
592-
int ret = 0
593-
object val
594-
ObjectVector uniques = ObjectVector()
595-
khiter_t k
596-
const char *v
597-
const char **vecs
598-
599-
if return_inverse:
600-
labels = np.zeros(n, dtype=np.int64)
601-
uindexer = np.empty(n, dtype=np.int64)
602-
603-
# assign pointers
604-
vecs = <const char **> malloc(n * sizeof(char *))
605-
for i in range(n):
606-
val = values[i]
607-
v = util.get_c_string(val)
608-
vecs[i] = v
609-
610-
611-
# compute
612-
with nogil:
613-
for i in range(n):
614-
v = vecs[i]
615-
k = kh_get_str(self.table, v)
616-
if return_inverse and k != self.table.n_buckets:
617-
# k falls into a previous bucket
618-
idx = self.table.vals[k]
619-
labels[i] = <int64_t>idx
620-
elif k == self.table.n_buckets:
621-
# k hasn't been seen yet
622-
k = kh_put_str(self.table, v, &ret)
623-
uindexer[count] = i
624-
if return_inverse:
625-
self.table.vals[k] = count
626-
labels[i] = <int64_t>count
627-
count += 1
628-
629-
free(vecs)
630-
631-
# uniques
632-
for i in range(count):
633-
uniques.append(values[uindexer[i]])
634-
635-
if return_inverse:
636-
return uniques.to_array(), np.asarray(labels)
637-
return uniques.to_array()
638-
639 586
@cython.boundscheck(False)
640 587
def lookup(self, ndarray[object] values):
641 588
cdef:
@@ -697,6 +644,59 @@ cdef class StringHashTable(HashTable):
697 644
self.table.vals[k] = i
698 645
free(vecs)
699 646

647+
@cython.boundscheck(False)
648+
def unique(self, ndarray[object] values, bint return_inverse=False):
649+
cdef:
650+
Py_ssize_t i, idx, count = 0, n = len(values)
651+
int64_t[:] labels
652+
int64_t[:] uindexer
653+
int ret = 0
654+
object val
655+
ObjectVector uniques = ObjectVector()
656+
khiter_t k
657+
const char *v
658+
const char **vecs
659+
660+
if return_inverse:
661+
labels = np.zeros(n, dtype=np.int64)
662+
uindexer = np.empty(n, dtype=np.int64)
663+
664+
# assign pointers
665+
vecs = <const char **> malloc(n * sizeof(char *))
666+
for i in range(n):
667+
val = values[i]
668+
v = util.get_c_string(val)
669+
vecs[i] = v
670+
671+
672+
# compute
673+
with nogil:
674+
for i in range(n):
675+
v = vecs[i]
676+
k = kh_get_str(self.table, v)
677+
if return_inverse and k != self.table.n_buckets:
678+
# k falls into a previous bucket
679+
idx = self.table.vals[k]
680+
labels[i] = <int64_t>idx
681+
elif k == self.table.n_buckets:
682+
# k hasn't been seen yet
683+
k = kh_put_str(self.table, v, &ret)
684+
uindexer[count] = i
685+
if return_inverse:
686+
self.table.vals[k] = count
687+
labels[i] = <int64_t>count
688+
count += 1
689+
690+
free(vecs)
691+
692+
# uniques
693+
for i in range(count):
694+
uniques.append(values[uindexer[i]])
695+
696+
if return_inverse:
697+
return uniques.to_array(), np.asarray(labels)
698+
return uniques.to_array()
699+
700 700
@cython.boundscheck(False)
701 701
def get_labels(self, ndarray[object] values, ObjectVector uniques,
702 702
Py_ssize_t count_prior=0, int64_t na_sentinel=-1,

0 commit comments

Comments
 (0)