@@ -35,13 +35,18 @@ cdef extern from "kvec.h":
35
35
size_t n, m
36
36
int64_t* a
37
37
38
# Growable vector of C doubles, declared in kvec.h.
ctypedef struct kv_double:
    size_t n    # element count (exposed as len() by Float64Vector)
    size_t m    # presumably the allocated capacity -- confirm in kvec.h
    double* a   # element buffer

# Growable vector of PyObject pointers, declared in kvec.h.
ctypedef struct kv_object_t:
    size_t n    # element count
    size_t m    # presumably the allocated capacity -- confirm in kvec.h
    PyObject** a

# Helper routines provided by kvec.h.
# NOTE(review): kv_double lacks the `_t` suffix used by kv_int64_t and
# kv_object_t; the name has to match the typedef in kvec.h.
inline void kv_object_push(kv_object_t* v, PyObject* x)
inline void kv_object_destroy(kv_object_t* v)
inline void kv_int64_push(kv_int64_t* v, int64_t x)
inline void kv_double_push(kv_double* v, double x)
45
50
46
51
47
52
cdef class ObjectVector:
@@ -53,6 +58,9 @@ cdef class ObjectVector:
53
58
def __cinit__(self):
    # A new vector owns its buffer; to_array() clears this flag when it
    # hands the buffer over to an ndarray.
    self.owndata = 1
55
60
61
def __len__(self):
    """Return ``self.vec.n``, the element count of the underlying kvec."""
    return self.vec.n
63
+
56
64
def to_array (self , xfer_data = True ):
57
65
""" Here we use the __array__ method, that is called when numpy
58
66
tries to get an array from the object."""
@@ -68,9 +76,10 @@ cdef class ObjectVector:
68
76
69
77
# NOTE(review): mingw32 reportedly has problems with the ownership transfer below -- confirm before relying on it
70
78
71
- # if xfer_data:
72
- # self.owndata = 0
73
- # util.set_array_owndata(result)
79
+ if xfer_data:
80
+ self .owndata = 0
81
+ util.set_array_owndata(result)
82
+
74
83
# return result
75
84
76
85
return result.copy()
@@ -92,6 +101,9 @@ cdef class Int64Vector:
92
101
def __cinit__(self):
    # Start out as the owner of the int64 buffer.
    # NOTE(review): presumably cleared by to_array(xfer_data=True), as in
    # the sibling vector classes -- confirm.
    self.owndata = 1
94
103
104
def __len__(self):
    """Return ``self.vec.n``, the element count of the underlying kvec."""
    return self.vec.n
106
+
95
107
def to_array (self , xfer_data = True ):
96
108
""" Here we use the __array__ method, that is called when numpy
97
109
tries to get an array from the object."""
@@ -118,6 +130,44 @@ cdef class Int64Vector:
118
130
if self .owndata:
119
131
free(self .vec.a)
120
132
133
cdef class Float64Vector:
    """Growable vector of float64 values backed by a ``kv_double`` buffer.

    Values are appended from Cython code with ``append`` and exported to
    NumPy with ``to_array``.
    """

    cdef:
        # Non-zero while this object is responsible for free()-ing vec.a.
        bint owndata
        kv_double vec

    def __cinit__(self):
        self.owndata = 1

    def __len__(self):
        """Return the number of values stored in the vector."""
        return self.vec.n

    def to_array(self, xfer_data=True):
        """Wrap the stored values in a 1-D float64 ndarray.

        The array is built directly on top of the vector's buffer; the data
        is not copied.

        Parameters
        ----------
        xfer_data : bool, default True
            If true, the returned array is marked as the owner of the buffer
            and this vector stops freeing it in ``__dealloc__``. Ownership is
            handed over at most once: later calls return a plain view, so the
            same buffer can never end up with two owners (which would free it
            twice).

        Returns
        -------
        ndarray
            Array of length ``len(self)`` sharing memory with the vector.

        Notes
        -----
        When the array does not own the buffer it is only valid while the
        owner (this vector, or the array that received ownership) is alive.
        """
        cdef:
            npy_intp shape[1]
            ndarray result

        shape[0] = <npy_intp> self.vec.n

        # Create a 1-D array of length vec.n on top of the existing buffer.
        result = PyArray_SimpleNewFromData(1, shape, np.NPY_FLOAT64,
                                           self.vec.a)

        # Transfer ownership only while we still hold it.
        # NOTE(review): appending after the buffer was handed over presumably
        # lets kv_double_push reallocate memory the array now owns -- confirm
        # against kvec.h and avoid append() after a transfer.
        if xfer_data and self.owndata:
            self.owndata = 0
            util.set_array_owndata(result)

        return result

    cdef inline append(self, float64_t x):
        # Push one value onto the end of the underlying kvec.
        kv_double_push(&self.vec, x)

    def __dealloc__(self):
        # Free the buffer unless ownership was handed to an ndarray.
        if self.owndata:
            free(self.vec.a)
170
+
121
171
122
172
cdef class HashTable:
123
173
pass
@@ -197,7 +247,7 @@ cdef class StringHashTable(HashTable):
197
247
object val
198
248
char * buf
199
249
khiter_t k
200
- list uniques = []
250
+ ObjectVector uniques = ObjectVector()
201
251
202
252
for i in range (n):
203
253
val = values[i]
@@ -212,7 +262,7 @@ cdef class StringHashTable(HashTable):
212
262
uniques.append(val)
213
263
214
264
# return None
215
- return uniques
265
+ return uniques.to_array( xfer_data = True )
216
266
217
267
def factorize (self , ndarray[object] values ):
218
268
cdef:
@@ -471,7 +521,7 @@ cdef class Int64HashTable(HashTable):
471
521
labels, counts = self .get_labels(values, reverse, 0 )
472
522
return reverse, labels, counts
473
523
474
- def get_labels (self , ndarray[int64_t] values , list uniques ,
524
+ def get_labels (self , ndarray[int64_t] values , Int64Vector uniques ,
475
525
Py_ssize_t count_prior , Py_ssize_t na_sentinel ):
476
526
cdef:
477
527
Py_ssize_t i, n = len (values)
@@ -570,7 +620,6 @@ def value_count_int64(ndarray[int64_t] values):
570
620
Py_ssize_t i, n = len (values)
571
621
kh_int64_t * table
572
622
int ret = 0
573
- list uniques = []
574
623
575
624
table = kh_init_int64()
576
625
kh_resize_int64(table, n)
@@ -615,11 +664,12 @@ cdef class Float64HashTable(HashTable):
615
664
kh_destroy_float64(self .table)
616
665
617
666
def factorize(self, ndarray[float64_t] values):
    """Factorize ``values`` and return ``(uniques, labels, counts)``.

    A fresh Float64Vector collects the unique values while get_labels()
    runs with ``count_prior=0`` and ``na_sentinel=-1``; it is then exported
    with ``to_array(xfer_data=True)`` so the returned array takes over the
    buffer.
    """
    collected = Float64Vector()
    labels, counts = self.get_labels(values, collected, 0, -1)
    unique_values = collected.to_array(xfer_data=True)
    return unique_values, labels, counts
621
670
622
- cpdef get_labels(self , ndarray[float64_t] values, list uniques,
671
+ cpdef get_labels(self , ndarray[float64_t] values,
672
+ Float64Vector uniques,
623
673
Py_ssize_t count_prior, int64_t na_sentinel):
624
674
cdef:
625
675
Py_ssize_t i, n = len (values)
@@ -690,7 +740,7 @@ cdef class Float64HashTable(HashTable):
690
740
int ret = 0
691
741
float64_t val
692
742
khiter_t k
693
- list uniques = []
743
+ Float64Vector uniques = Float64Vector()
694
744
bint seen_na = 0
695
745
696
746
# TODO: kvec
@@ -708,7 +758,7 @@ cdef class Float64HashTable(HashTable):
708
758
seen_na = 1
709
759
uniques.append(ONAN)
710
760
711
- return uniques
761
+ return uniques.to_array( xfer_data = True )
712
762
713
763
cdef class PyObjectHashTable(HashTable):
714
764
cdef kh_pymap_t * table
@@ -842,7 +892,7 @@ cdef class PyObjectHashTable(HashTable):
842
892
843
893
return result
844
894
845
- cpdef get_labels(self , ndarray[object ] values, list uniques,
895
+ cpdef get_labels(self , ndarray[object ] values, ObjectVector uniques,
846
896
Py_ssize_t count_prior, int64_t na_sentinel):
847
897
cdef:
848
898
Py_ssize_t i, n = len (values)
@@ -882,12 +932,12 @@ cdef class PyObjectHashTable(HashTable):
882
932
883
933
cdef class Factorizer:
884
934
cdef public PyObjectHashTable table
885
- cdef public uniques
935
+ cdef public ObjectVector uniques
886
936
cdef public Py_ssize_t count
887
937
888
938
def __init__(self, size_hint):
    """Create an empty factorizer.

    ``size_hint`` is passed straight to the PyObjectHashTable constructor;
    the uniques vector starts empty and the running count at zero.
    """
    self.table = PyObjectHashTable(size_hint)
    self.uniques = ObjectVector()
    self.count = 0
892
942
893
943
def get_count (self ):
@@ -902,7 +952,7 @@ cdef class Factorizer:
902
952
if labels.dtype != np.int_:
903
953
labels = labels.astype(np.int_)
904
954
905
- sorter = list_to_object_array( self .uniques).argsort()
955
+ sorter = self .uniques.to_array( xfer_data = False ).argsort()
906
956
reverse_indexer = np.empty(len (sorter), dtype = np.int_)
907
957
reverse_indexer.put(sorter, np.arange(len (sorter)))
908
958
@@ -919,12 +969,12 @@ cdef class Factorizer:
919
969
920
970
cdef class Int64Factorizer:
921
971
cdef public Int64HashTable table
922
- cdef public list uniques
972
+ cdef public Int64Vector uniques
923
973
cdef public Py_ssize_t count
924
974
925
975
def __init__(self, size_hint):
    """Create an empty int64 factorizer.

    ``size_hint`` is passed straight to the Int64HashTable constructor;
    the uniques vector starts empty and the running count at zero.
    """
    self.table = Int64HashTable(size_hint)
    self.uniques = Int64Vector()
    self.count = 0
929
979
930
980
def get_count (self ):
@@ -940,7 +990,7 @@ cdef class Int64Factorizer:
940
990
if labels.dtype != np.int_:
941
991
labels = labels.astype(np.int_)
942
992
943
- sorter = list_to_object_array( self .uniques).argsort()
993
+ sorter = self .uniques.to_array( xfer_data = False ).argsort()
944
994
reverse_indexer = np.empty(len (sorter), dtype = np.int_)
945
995
reverse_indexer.put(sorter, np.arange(len (sorter)))
946
996
@@ -951,103 +1001,6 @@ cdef class Int64Factorizer:
951
1001
return labels, counts
952
1002
953
1003
954
- cdef class DictFactorizer:
955
-
956
- cdef public:
957
- dict table
958
- list uniques
959
- Py_ssize_t count
960
-
961
- def __init__ (self , table = None , uniques = None ):
962
- if table is None :
963
- self .table = {}
964
- else :
965
- self .table = table
966
-
967
- if uniques is None :
968
- self .uniques = []
969
- self .count = 0
970
- else :
971
- self .uniques = uniques
972
- self .count = len (uniques)
973
-
974
- def get_count (self ):
975
- return self .count
976
-
977
- def get_labels (self , ndarray[object] values ):
978
- cdef:
979
- Py_ssize_t i, n = len (values)
980
- ndarray[int64_t] labels
981
- ndarray[int64_t] counts
982
- Py_ssize_t idx, count = self .count
983
- int ret = 0
984
- object val
985
-
986
- labels = np.empty(n, dtype = np.int64)
987
- counts = np.empty(count + n, dtype = np.int64)
988
-
989
- for i in range (n):
990
- val = values[i]
991
-
992
- if val in self .table:
993
- idx = self .table[val]
994
- labels[i] = idx
995
- counts[idx] = counts[idx] + 1
996
- else :
997
- self .table[val] = count
998
- self .uniques.append(val)
999
- labels[i] = count
1000
- counts[count] = 1
1001
- count += 1
1002
-
1003
- return labels, counts[:count].copy()
1004
-
1005
- def factorize (self , ndarray[object] values , sort = False ):
1006
- labels, counts = self .get_labels(values)
1007
-
1008
- # sort on
1009
- if sort:
1010
- if labels.dtype != np.int_:
1011
- labels = labels.astype(np.int_)
1012
-
1013
- sorter = list_to_object_array(self .uniques).argsort()
1014
- reverse_indexer = np.empty(len (sorter), dtype = np.int_)
1015
- reverse_indexer.put(sorter, np.arange(len (sorter)))
1016
-
1017
- labels = reverse_indexer.take(labels)
1018
- counts = counts.take(sorter)
1019
-
1020
- self .count = len (counts)
1021
- return labels, counts
1022
-
1023
- def unique (self , ndarray[object] values ):
1024
- cdef:
1025
- Py_ssize_t i, n = len (values)
1026
- Py_ssize_t idx, count = self .count
1027
- object val
1028
-
1029
- for i in range (n):
1030
- val = values[i]
1031
- if val not in self .table:
1032
- self .table[val] = count
1033
- self .uniques.append(val)
1034
- count += 1
1035
- return self .uniques
1036
-
1037
-
1038
- def unique_int64 (self , ndarray[int64_t] values ):
1039
- cdef:
1040
- Py_ssize_t i, n = len (values)
1041
- Py_ssize_t idx, count = self .count
1042
- int64_t val
1043
-
1044
- for i in range (n):
1045
- val = values[i]
1046
- if val not in self .table:
1047
- self .table[val] = count
1048
- self .uniques.append(val)
1049
- count += 1
1050
- return self .uniques
1051
1004
1052
1005
def lookup2 (ndarray[object] values ):
1053
1006
cdef:
0 commit comments