@@ -222,6 +222,31 @@ cdef class StringHashTable:
222
222
resbuf[i] = - 1
223
223
return labels
224
224
225
def unique(self, ndarray[object] values):
    """
    Collect the entries of ``values`` that are not yet keys of this table.

    Every entry whose string buffer is missing from ``self.table`` is
    inserted into the table and appended to the result, so repeats within
    ``values`` and keys inserted by earlier calls are left out.

    Parameters
    ----------
    values : ndarray[object]
        Each element is passed to ``PyString_AsString``.
        NOTE(review): presumably every element must be a byte string
        -- confirm against callers.

    Returns
    -------
    list
        The newly inserted objects, in order of first appearance.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        int ret
        object val
        char *buf
        khiter_t k
        list uniques = []

    for i in range(n):
        val = values[i]
        # NOTE(review): the table is handed the raw buffer obtained from
        # val; presumably val must outlive the table entry -- confirm.
        buf = PyString_AsString(val)
        k = kh_get_str(self.table, buf)
        # a lookup result equal to n_buckets is treated as "key not found"
        if k == self.table.n_buckets:
            k = kh_put_str(self.table, buf, &ret)
            # NOTE(review): a zero ret right after a failed lookup looks
            # unreachable; kept as in the original -- confirm whether this
            # delete is still wanted.
            if not ret:
                kh_del_str(self.table, k)
            uniques.append(val)

    return uniques
249
+
225
250
def factorize (self , ndarray[object] values ):
226
251
cdef:
227
252
Py_ssize_t i, n = len (values)
@@ -476,6 +501,25 @@ cdef class Int64HashTable:
476
501
477
502
return labels, counts[:count].copy()
478
503
504
def unique(self, ndarray[int64_t] values):
    """
    Collect the entries of ``values`` that are not yet keys of this table.

    Every value missing from ``self.table`` is inserted into the table and
    appended to the result, so repeats within ``values`` and keys inserted
    by earlier calls are left out.

    Parameters
    ----------
    values : ndarray[int64_t]

    Returns
    -------
    list
        The newly inserted values, in order of first appearance.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        int ret
        int64_t val
        khiter_t k
        list uniques = []

    for i in range(n):
        val = values[i]
        k = kh_get_int64(self.table, val)
        # a lookup result equal to n_buckets is treated as "key not found"
        if k == self.table.n_buckets:
            # ret receives the insertion status; it is not inspected here
            k = kh_put_int64(self.table, val, &ret)
            uniques.append(val)

    return uniques
522
+
479
523
cdef class PyObjectHashTable:
480
524
481
525
cdef:
@@ -571,9 +615,6 @@ cdef class PyObjectHashTable:
571
615
def unique (self , ndarray[object] values ):
572
616
cdef:
573
617
Py_ssize_t i, n = len (values)
574
- ndarray[int32_t] labels = np.empty(n, dtype = np.int32)
575
- ndarray[int32_t] counts = np.empty(n, dtype = np.int32)
576
- dict reverse = {}
577
618
Py_ssize_t idx, count = 0
578
619
int ret
579
620
object val
@@ -625,6 +666,22 @@ cdef class PyObjectHashTable:
625
666
626
667
return labels, counts[:count].copy()
627
668
669
+ # def unique(self, ndarray[object] values, list uniques):
670
+ # cdef:
671
+ # Py_ssize_t i, n = len(values)
672
+ # Py_ssize_t idx, count = 0
673
+ # int ret
674
+ # object val
675
+ # khiter_t k
676
+
677
+ # for i in range(n):
678
+ # val = values[i]
679
+ # k = kh_get_pymap(self.table, <PyObject*>val)
680
+ # if k == self.table.n_buckets:
681
+ # k = kh_put_pymap(self.table, <PyObject*>val, &ret)
682
+ # uniques.append(val)
683
+ # count += 1
684
+
628
685
cdef class Factorizer:
629
686
630
687
cdef public:
@@ -656,6 +713,10 @@ cdef class Factorizer:
656
713
self .count = len (counts)
657
714
return labels, counts
658
715
716
def unique(self, ndarray[object] values):
    """
    Forward ``values`` to ``self.table.unique`` and return its result.
    """
    found = self.table.unique(values)
    return found
719
+
659
720
cdef class Int64Factorizer:
660
721
661
722
cdef public:
@@ -753,6 +814,34 @@ cdef class DictFactorizer:
753
814
self .count = len (counts)
754
815
return labels, counts
755
816
817
def unique(self, ndarray[object] values):
    """
    Register every unseen entry of ``values`` and return ``self.uniques``.

    An entry that is not a key of ``self.table`` is stored there with the
    next running id and appended to ``self.uniques``. The running id
    starts at ``self.count`` and is written back to ``self.count`` before
    returning, so a later call continues with fresh ids instead of reusing
    ones already stored in the table.

    Parameters
    ----------
    values : ndarray[object]

    Returns
    -------
    list
        ``self.uniques`` itself (not a copy), including entries added by
        earlier calls.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        Py_ssize_t count = self.count
        object val

    for i in range(n):
        val = values[i]
        if val not in self.table:
            self.table[val] = count
            self.uniques.append(val)
            count += 1

    # keep the stored counter in step with the ids handed out above
    self.count = count
    return self.uniques
830
+
831
+
832
def unique_int64(self, ndarray[int64_t] values):
    """
    Register every unseen entry of ``values`` and return ``self.uniques``.

    The int64 counterpart of ``unique``: a value that is not a key of
    ``self.table`` is stored there with the next running id and appended
    to ``self.uniques``. The running id starts at ``self.count`` and is
    written back to ``self.count`` before returning, so a later call
    continues with fresh ids instead of reusing ones already stored.

    Parameters
    ----------
    values : ndarray[int64_t]

    Returns
    -------
    list
        ``self.uniques`` itself (not a copy), including entries added by
        earlier calls.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        Py_ssize_t count = self.count
        int64_t val

    for i in range(n):
        val = values[i]
        if val not in self.table:
            self.table[val] = count
            self.uniques.append(val)
            count += 1

    # keep the stored counter in step with the ids handed out above
    self.count = count
    return self.uniques
756
845
757
846
def lookup_locations2 (ndarray[object] values ):
758
847
cdef:
0 commit comments