@@ -356,64 +356,21 @@ cdef class {{name}}HashTable(HashTable):
356
356
return np.asarray(locs)
357
357
358
358
@cython.boundscheck(False)
359
- def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
359
+ def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
360
+ bint ignore_na=False, bint return_inverse=False,
361
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
362
+ object na_value=None):
360
363
cdef:
361
- Py_ssize_t i, idx, count = 0 , n = len(values)
364
+ Py_ssize_t i, idx, count = count_prior , n = len(values)
362
365
int64_t[:] labels
363
366
int ret = 0
364
- {{dtype}}_t val
365
- khiter_t k
366
- {{name}}Vector uniques = {{name}}Vector()
367
- {{name}}VectorData *ud
368
-
369
- ud = uniques.data
370
- if return_inverse:
371
- labels = np.empty(n, dtype=np.int64)
372
-
373
- with nogil:
374
- for i in range(n):
375
- val = values[i]
376
- k = kh_get_{{dtype}}(self.table, val)
377
- if return_inverse and k != self.table.n_buckets:
378
- # k falls into a previous bucket
379
- idx = self.table.vals[k]
380
- labels[i] = idx
381
- elif k == self.table.n_buckets:
382
- # k hasn't been seen yet
383
- k = kh_put_{{dtype}}(self.table, val, &ret)
384
- if needs_resize(ud):
385
- with gil:
386
- uniques.resize()
387
- append_data_{{dtype}}(ud, val)
388
- if return_inverse:
389
- self.table.vals[k] = count
390
- labels[i] = count
391
- count += 1
392
-
393
- if return_inverse:
394
- return uniques.to_array(), np.asarray(labels)
395
- return uniques.to_array()
396
-
397
- def factorize(self, {{dtype}}_t[:] values):
398
- uniques = {{name}}Vector()
399
- labels = self.get_labels(values, uniques, 0)
400
- return uniques.to_array(), labels
401
-
402
- @cython.boundscheck(False)
403
- def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
404
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
405
- object na_value=None):
406
- cdef:
407
- Py_ssize_t i, n = len(values)
408
- int64_t[:] labels
409
- Py_ssize_t idx, count = count_prior
410
- int ret = 0
411
367
{{dtype}}_t val, na_value2
412
368
khiter_t k
413
369
{{name}}VectorData *ud
414
370
bint use_na_value
415
371
416
- labels = np.empty(n, dtype=np.int64)
372
+ if return_inverse:
373
+ labels = np.empty(n, dtype=np.int64)
417
374
ud = uniques.data
418
375
use_na_value = na_value is not None
419
376
@@ -431,21 +388,19 @@ cdef class {{name}}HashTable(HashTable):
431
388
for i in range(n):
432
389
val = values[i]
433
390
434
- if val != val or (use_na_value and val == na_value2):
391
+ if ignore_na and (val != val
392
+ or (use_na_value and val == na_value2)):
435
393
labels[i] = na_sentinel
436
394
continue
437
395
438
396
k = kh_get_{{dtype}}(self.table, val)
439
-
440
- if k != self.table.n_buckets:
397
+ if return_inverse and k != self.table.n_buckets:
441
398
# k falls into a previous bucket
442
399
idx = self.table.vals[k]
443
400
labels[i] = idx
444
- else :
401
+ elif k == self.table.n_buckets :
445
402
# k hasn't been seen yet
446
403
k = kh_put_{{dtype}}(self.table, val, &ret)
447
- self.table.vals[k] = count
448
-
449
404
if needs_resize(ud):
450
405
with gil:
451
406
if uniques.external_view_exists:
@@ -454,10 +409,30 @@ cdef class {{name}}HashTable(HashTable):
454
409
"Vector.resize() needed")
455
410
uniques.resize()
456
411
append_data_{{dtype}}(ud, val)
457
- labels[i] = count
412
+ if return_inverse:
413
+ self.table.vals[k] = count
414
+ labels[i] = count
458
415
count += 1
459
416
460
- return np.asarray(labels)
417
+ if return_inverse:
418
+ return uniques.to_array(), np.asarray(labels)
419
+ return uniques.to_array()
420
+
421
+ def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
422
+ return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
423
+ return_inverse=return_inverse)
424
+
425
+ def factorize(self, {{dtype}}_t[:] values):
426
+ return self._unique(values, uniques={{name}}Vector(), ignore_na=True,
427
+ return_inverse=True)
428
+
429
+ def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
430
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
431
+ object na_value=None):
432
+ _, labels = self._unique(values, uniques, ignore_na=True,
433
+ return_inverse=True, count_prior=count_prior,
434
+ na_sentinel=na_sentinel, na_value=na_value)
435
+ return labels
461
436
462
437
@cython.boundscheck(False)
463
438
def get_labels_groupby(self, const {{dtype}}_t[:] values):
@@ -645,33 +620,45 @@ cdef class StringHashTable(HashTable):
645
620
free(vecs)
646
621
647
622
@cython.boundscheck(False)
648
- def unique(self, ndarray[object] values, bint return_inverse=False):
623
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
624
+ bint ignore_na=False, bint return_inverse=False,
625
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
626
+ object na_value=None):
649
627
cdef:
650
- Py_ssize_t i, idx, count = 0 , n = len(values)
628
+ Py_ssize_t i, idx, count = count_prior , n = len(values)
651
629
int64_t[:] labels
652
630
int64_t[:] uindexer
653
631
int ret = 0
654
632
object val
655
- ObjectVector uniques = ObjectVector()
656
- khiter_t k
657
633
const char *v
658
634
const char **vecs
635
+ khiter_t k
636
+ bint use_na_value
659
637
660
638
if return_inverse:
661
639
labels = np.zeros(n, dtype=np.int64)
662
640
uindexer = np.empty(n, dtype=np.int64)
641
+ use_na_value = na_value is not None
663
642
664
- # assign pointers
643
+ # assign pointers and pre-filter out missing (if ignore_na)
665
644
vecs = <const char **> malloc(n * sizeof(char *))
666
645
for i in range(n):
667
646
val = values[i]
668
- v = util.get_c_string(val)
669
- vecs[i] = v
670
647
648
+ if not ignore_na or ((PyUnicode_Check(val) or PyString_Check(val))
649
+ and not (use_na_value and val == na_value)):
650
+ # if ignore_na is False, we also stringify NaN/None/etc.
651
+ v = util.get_c_string(val)
652
+ vecs[i] = v
653
+ else:
654
+ labels[i] = na_sentinel
671
655
672
656
# compute
673
657
with nogil:
674
658
for i in range(n):
659
+ if ignore_na and labels[i] == na_sentinel:
660
+ continue
661
+
675
662
v = vecs[i]
676
663
k = kh_get_str(self.table, v)
677
664
if return_inverse and k != self.table.n_buckets:
@@ -697,65 +684,21 @@ cdef class StringHashTable(HashTable):
697
684
return uniques.to_array(), np.asarray(labels)
698
685
return uniques.to_array()
699
686
700
- @cython.boundscheck(False)
701
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
702
- Py_ssize_t count_prior=0, int64_t na_sentinel=-1,
703
- object na_value=None):
704
- cdef:
705
- Py_ssize_t i, n = len(values)
706
- int64_t[:] labels
707
- int64_t[:] uindexer
708
- Py_ssize_t idx, count = count_prior
709
- int ret = 0
710
- object val
711
- const char *v
712
- const char **vecs
713
- khiter_t k
714
- bint use_na_value
715
-
716
- # these by-definition *must* be strings
717
- labels = np.zeros(n, dtype=np.int64)
718
- uindexer = np.empty(n, dtype=np.int64)
719
- use_na_value = na_value is not None
720
-
721
- # pre-filter out missing
722
- # and assign pointers
723
- vecs = <const char **> malloc(n * sizeof(char *))
724
- for i in range(n):
725
- val = values[i]
726
-
727
- if ((PyUnicode_Check(val) or PyString_Check(val)) and
728
- not (use_na_value and val == na_value)):
729
- v = util.get_c_string(val)
730
- vecs[i] = v
731
- else:
732
- labels[i] = na_sentinel
733
-
734
- # compute
735
- with nogil:
736
- for i in range(n):
737
- if labels[i] == na_sentinel:
738
- continue
739
-
740
- v = vecs[i]
741
- k = kh_get_str(self.table, v)
742
- if k != self.table.n_buckets:
743
- idx = self.table.vals[k]
744
- labels[i] = <int64_t>idx
745
- else:
746
- k = kh_put_str(self.table, v, &ret)
747
- self.table.vals[k] = count
748
- uindexer[count] = i
749
- labels[i] = <int64_t>count
750
- count += 1
751
-
752
- free(vecs)
687
+ def unique(self, ndarray[object] values, bint return_inverse=False):
688
+ return self._unique(values, uniques=ObjectVector(), ignore_na=False,
689
+ return_inverse=return_inverse)
753
690
754
- # uniques
755
- for i in range(count):
756
- uniques.append(values[uindexer[i]] )
691
+ def factorize(self, ndarray[object] values):
692
+ return self._unique(values, uniques=ObjectVector(), ignore_na=True,
693
+ return_inverse=True )
757
694
758
- return np.asarray(labels)
695
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
696
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
697
+ object na_value=None):
698
+ _, labels = self._unique(values, uniques, ignore_na=True,
699
+ return_inverse=True, count_prior=count_prior,
700
+ na_sentinel=na_sentinel, na_value=na_value)
701
+ return labels
759
702
760
703
761
704
cdef class PyObjectHashTable(HashTable):
@@ -844,21 +787,31 @@ cdef class PyObjectHashTable(HashTable):
844
787
return np.asarray(locs)
845
788
846
789
@cython.boundscheck(False)
847
- def unique(self, ndarray[object] values, bint return_inverse=False):
790
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
791
+ bint ignore_na=False, bint return_inverse=False,
792
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
793
+ object na_value=None):
848
794
cdef:
849
- Py_ssize_t i, idx, count = 0 , n = len(values)
795
+ Py_ssize_t i, idx, count = count_prior , n = len(values)
850
796
int64_t[:] labels
851
797
int ret = 0
852
798
object val
853
799
khiter_t k
854
- ObjectVector uniques = ObjectVector()
800
+ bint use_na_value
855
801
856
802
if return_inverse:
857
803
labels = np.empty(n, dtype=np.int64)
804
+ use_na_value = na_value is not None
858
805
859
806
for i in range(n):
860
807
val = values[i]
861
808
hash(val)
809
+
810
+ if ignore_na and ((val != val or val is None)
811
+ or (use_na_value and val == na_value)):
812
+ labels[i] = na_sentinel
813
+ continue
814
+
862
815
k = kh_get_pymap(self.table, <PyObject*>val)
863
816
if return_inverse and k != self.table.n_buckets:
864
817
# k falls into a previous bucket
@@ -877,42 +830,18 @@ cdef class PyObjectHashTable(HashTable):
877
830
return uniques.to_array(), np.asarray(labels)
878
831
return uniques.to_array()
879
832
880
- @cython.boundscheck(False)
881
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
882
- Py_ssize_t count_prior=0, int64_t na_sentinel=-1,
883
- object na_value=None):
884
- cdef:
885
- Py_ssize_t i, n = len(values)
886
- int64_t[:] labels
887
- Py_ssize_t idx, count = count_prior
888
- int ret = 0
889
- object val
890
- khiter_t k
891
- bint use_na_value
892
-
893
- labels = np.empty(n, dtype=np.int64)
894
- use_na_value = na_value is not None
895
-
896
- for i in range(n):
897
- val = values[i]
898
- hash(val)
899
-
900
- if ((val != val or val is None) or
901
- (use_na_value and val == na_value)):
902
- labels[i] = na_sentinel
903
- continue
833
+ def unique(self, ndarray[object] values, bint return_inverse=False):
834
+ return self._unique(values, uniques=ObjectVector(), ignore_na=False,
835
+ return_inverse=return_inverse)
904
836
905
- k = kh_get_pymap(self.table, <PyObject*>val)
906
- if k != self.table.n_buckets:
907
- # k falls into a previous bucket
908
- idx = self.table.vals[k]
909
- labels[i] = idx
910
- else:
911
- # k hasn't been seen yet
912
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
913
- self.table.vals[k] = count
914
- uniques.append(val)
915
- labels[i] = count
916
- count += 1
837
+ def factorize(self, ndarray[object] values):
838
+ return self._unique(values, uniques=ObjectVector(), ignore_na=True,
839
+ return_inverse=True)
917
840
918
- return np.asarray(labels)
841
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
842
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
843
+ object na_value=None):
844
+ _, labels = self._unique(values, uniques, ignore_na=True,
845
+ return_inverse=True, count_prior=count_prior,
846
+ na_sentinel=na_sentinel, na_value=na_value)
847
+ return labels
0 commit comments