@@ -355,6 +355,45 @@ cdef class {{name}}HashTable(HashTable):

        return np.asarray(locs)

+    @cython.boundscheck(False)
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
+        cdef:
+            Py_ssize_t i, idx, count = 0, n = len(values)
+            int64_t[:] labels
+            int ret = 0
+            {{dtype}}_t val
+            khiter_t k
+            {{name}}Vector uniques = {{name}}Vector()
+            {{name}}VectorData *ud
+
+        ud = uniques.data
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
+
+        with nogil:
+            for i in range(n):
+                val = values[i]
+                k = kh_get_{{dtype}}(self.table, val)
+                if return_inverse and k != self.table.n_buckets:
+                    # k falls into a previous bucket
+                    idx = self.table.vals[k]
+                    labels[i] = idx
+                elif k == self.table.n_buckets:
+                    # k hasn't been seen yet
+                    k = kh_put_{{dtype}}(self.table, val, &ret)
+                    if needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    append_data_{{dtype}}(ud, val)
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = count
+                    count += 1
+
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
+
    def factorize(self, {{dtype}}_t[:] values):
        uniques = {{name}}Vector()
        labels = self.get_labels(values, uniques, 0)
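
For reference, the khash loop added above is equivalent to the following pure-Python sketch: a dict plays the role of the hash table, mapping each value to its position in the uniques vector, and the optional labels array records that position for every input element. This is illustrative only; unique_with_inverse is not a pandas function.

    import numpy as np

    def unique_with_inverse(values, return_inverse=False):
        # dict stands in for the khash table: value -> index in uniques
        seen = {}
        uniques = []
        labels = np.empty(len(values), dtype=np.int64) if return_inverse else None
        for i, val in enumerate(values):
            if val in seen:
                # value falls into a previous bucket
                if return_inverse:
                    labels[i] = seen[val]
            else:
                # value hasn't been seen yet
                seen[val] = len(uniques)
                if return_inverse:
                    labels[i] = seen[val]
                uniques.append(val)
        if return_inverse:
            return np.asarray(uniques), labels
        return np.asarray(uniques)
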
@@ -465,45 +504,6 @@ cdef class {{name}}HashTable(HashTable):

        return np.asarray(labels), arr_uniques

-    @cython.boundscheck(False)
-    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
-        cdef:
-            Py_ssize_t i, idx, count = 0, n = len(values)
-            int64_t[:] labels
-            int ret = 0
-            {{dtype}}_t val
-            khiter_t k
-            {{name}}Vector uniques = {{name}}Vector()
-            {{name}}VectorData *ud
-
-        ud = uniques.data
-        if return_inverse:
-            labels = np.empty(n, dtype=np.int64)
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_get_{{dtype}}(self.table, val)
-                if return_inverse and k != self.table.n_buckets:
-                    # k falls into a previous bucket
-                    idx = self.table.vals[k]
-                    labels[i] = idx
-                elif k == self.table.n_buckets:
-                    # k hasn't been seen yet
-                    k = kh_put_{{dtype}}(self.table, val, &ret)
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, val)
-                    if return_inverse:
-                        self.table.vals[k] = count
-                        labels[i] = count
-                    count += 1
-
-        if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        return uniques.to_array()
-
{{endfor}}

@@ -583,59 +583,6 @@ cdef class StringHashTable(HashTable):
        free(vecs)
        return labels

-    @cython.boundscheck(False)
-    def unique(self, ndarray[object] values, bint return_inverse=False):
-        cdef:
-            Py_ssize_t i, idx, count = 0, n = len(values)
-            int64_t[:] labels
-            int64_t[:] uindexer
-            int ret = 0
-            object val
-            ObjectVector uniques = ObjectVector()
-            khiter_t k
-            const char *v
-            const char **vecs
-
-        if return_inverse:
-            labels = np.zeros(n, dtype=np.int64)
-            uindexer = np.empty(n, dtype=np.int64)
-
-        # assign pointers
-        vecs = <const char **> malloc(n * sizeof(char *))
-        for i in range(n):
-            val = values[i]
-            v = util.get_c_string(val)
-            vecs[i] = v
-
-
-        # compute
-        with nogil:
-            for i in range(n):
-                v = vecs[i]
-                k = kh_get_str(self.table, v)
-                if return_inverse and k != self.table.n_buckets:
-                    # k falls into a previous bucket
-                    idx = self.table.vals[k]
-                    labels[i] = <int64_t>idx
-                elif k == self.table.n_buckets:
-                    # k hasn't been seen yet
-                    k = kh_put_str(self.table, v, &ret)
-                    uindexer[count] = i
-                    if return_inverse:
-                        self.table.vals[k] = count
-                        labels[i] = <int64_t>count
-                    count += 1
-
-        free(vecs)
-
-        # uniques
-        for i in range(count):
-            uniques.append(values[uindexer[i]])
-
-        if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        return uniques.to_array()
-
    @cython.boundscheck(False)
    def lookup(self, ndarray[object] values):
        cdef:
@@ -697,6 +644,59 @@ cdef class StringHashTable(HashTable):
                self.table.vals[k] = i
        free(vecs)

+    @cython.boundscheck(False)
+    def unique(self, ndarray[object] values, bint return_inverse=False):
+        cdef:
+            Py_ssize_t i, idx, count = 0, n = len(values)
+            int64_t[:] labels
+            int64_t[:] uindexer
+            int ret = 0
+            object val
+            ObjectVector uniques = ObjectVector()
+            khiter_t k
+            const char *v
+            const char **vecs
+
+        if return_inverse:
+            labels = np.zeros(n, dtype=np.int64)
+            uindexer = np.empty(n, dtype=np.int64)
+
+        # assign pointers
+        vecs = <const char **> malloc(n * sizeof(char *))
+        for i in range(n):
+            val = values[i]
+            v = util.get_c_string(val)
+            vecs[i] = v
+
+
+        # compute
+        with nogil:
+            for i in range(n):
+                v = vecs[i]
+                k = kh_get_str(self.table, v)
+                if return_inverse and k != self.table.n_buckets:
+                    # k falls into a previous bucket
+                    idx = self.table.vals[k]
+                    labels[i] = <int64_t>idx
+                elif k == self.table.n_buckets:
+                    # k hasn't been seen yet
+                    k = kh_put_str(self.table, v, &ret)
+                    uindexer[count] = i
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = <int64_t>count
+                    count += 1
+
+        free(vecs)
+
+        # uniques
+        for i in range(count):
+            uniques.append(values[uindexer[i]])
+
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
+
    @cython.boundscheck(False)
    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, int64_t na_sentinel=-1,
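
A hedged usage sketch of the relocated method, assuming this template is rendered into the usual generated classes (e.g. Int64HashTable) exposed from pandas._libs.hashtable and that pandas has been built with this change; the exact module layout is an assumption, not confirmed by the diff.

    import numpy as np
    from pandas._libs import hashtable as ht

    values = np.array([3, 1, 3, 2, 1], dtype=np.int64)

    # uniques only, in order of first appearance
    uniques = ht.Int64HashTable().unique(values)

    # unique() mutates the table, so use a fresh table per call; with
    # return_inverse=True it also returns labels such that
    # uniques[labels] reconstructs the original values
    uniques2, labels = ht.Int64HashTable().unique(values, return_inverse=True)
    assert (uniques2[labels] == values).all()
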