@@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 
 {{py:
 
-# dtype, ttype, c_type, to_c_type, to_dtype
-dtypes = [('complex128', 'complex128', 'khcomplex128_t',
-           'to_khcomplex128_t', 'to_complex128'),
-          ('complex64', 'complex64', 'khcomplex64_t',
-           'to_khcomplex64_t', 'to_complex64'),
-          ('float64', 'float64', 'float64_t', '', ''),
-          ('float32', 'float32', 'float32_t', '', ''),
-          ('uint64', 'uint64', 'uint64_t', '', ''),
-          ('uint32', 'uint32', 'uint32_t', '', ''),
-          ('uint16', 'uint16', 'uint16_t', '', ''),
-          ('uint8', 'uint8', 'uint8_t', '', ''),
-          ('object', 'pymap', 'object', '', ''),
-          ('int64', 'int64', 'int64_t', '', ''),
-          ('int32', 'int32', 'int32_t', '', ''),
-          ('int16', 'int16', 'int16_t', '', ''),
-          ('int8', 'int8', 'int8_t', '', '')]
+# name, dtype, ttype, c_type, to_c_type
+dtypes = [('Complex128', 'complex128', 'complex128',
+           'khcomplex128_t', 'to_khcomplex128_t'),
+          ('Complex64', 'complex64', 'complex64',
+           'khcomplex64_t', 'to_khcomplex64_t'),
+          ('Float64', 'float64', 'float64', 'float64_t', ''),
+          ('Float32', 'float32', 'float32', 'float32_t', ''),
+          ('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
+          ('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
+          ('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
+          ('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
+          ('Object', 'object', 'pymap', 'object', ''),
+          ('Int64', 'int64', 'int64', 'int64_t', ''),
+          ('Int32', 'int32', 'int32', 'int32_t', ''),
+          ('Int16', 'int16', 'int16', 'int16_t', ''),
+          ('Int8', 'int8', 'int8', 'int8_t', '')]
 
 }}
 
-{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}}
+{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
 
 
 @cython.wraparound(False)
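
Note on this first hunk: the new leading `name` field is what lets the function further down pick the matching growable result vector (`Int64Vector`, `Float64Vector`, ... via `{{name}}Vector()`). As a minimal sketch of how such a Tempita template expands, assuming the Tempita copy vendored with Cython is available; the two-field `dtypes` list and the printed body are illustrative only, not the real template:

```python
# Illustrative only: a stripped-down stand-in for the real .pxi.in template.
from Cython import Tempita  # pandas' build renders .pxi.in files with Tempita

template = """\
{{py:
# name, dtype  (toy two-field version of the real five-field list)
dtypes = [('Int64', 'int64'), ('Float64', 'float64')]
}}
{{for name, dtype in dtypes}}
cpdef value_count_{{dtype}}(...):   # would use {{name}}Vector() internally
{{endfor}}
"""

# Emits one `cpdef value_count_int64(...)` / `cpdef value_count_float64(...)`
# stub per tuple, with Int64Vector / Float64Vector spliced into the body.
print(Tempita.sub(template))
```
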
@@ -77,54 +77,77 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
+cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
 {{else}}
 cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i = 0
+        Py_ssize_t n = len(values)
+        size_t unique_key_index = 0
+        size_t unique_key_count = 0
         kh_{{ttype}}_t *table
 
-        {{if dtype != 'object'}}
-        {{dtype}}_t[:] result_keys
-        int64_t[:] result_counts
-        {{endif}}
-
         # Don't use Py_ssize_t, since table.n_buckets is unsigned
         khiter_t k
+        bint is_null
+
+        {{c_type}} val
+
+        int ret = 0
+
+    # we track the order in which keys are first seen (GH39009);
+    # khash-map isn't insertion-ordered, thus:
+    #    table maps each key to its index_of_appearance
+    #    result_keys maps index_of_appearance to key
+    #    result_counts maps index_of_appearance to number of elements
 
+    result_keys = {{name}}Vector()
+    result_counts = Int64Vector()
     table = kh_init_{{ttype}}()
+
     {{if dtype == 'object'}}
-    build_count_table_{{dtype}}(values, table, 1)
+    kh_resize_{{ttype}}(table, n // 10)
+
+    for i in range(n):
+        val = values[i]
+        is_null = checknull(val)
+        if not is_null or not dropna:
+            # all NAs become the same representative:
+            if is_null:
+                val = navalue
+            k = kh_get_{{ttype}}(table, <PyObject*>val)
+            if k != table.n_buckets:
+                unique_key_index = table.vals[k]
+                result_counts.data.data[unique_key_index] += 1
+            else:
+                k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
+                table.vals[k] = unique_key_count
+                result_keys.append(val)
+                result_counts.append(1)
+                unique_key_count += 1
     {{else}}
-    build_count_table_{{dtype}}(values, table, dropna)
-    {{endif}}
+    kh_resize_{{ttype}}(table, n)
 
-    result_keys = np.empty(table.n_occupied, '{{dtype}}')
-    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+    for i in range(n):
+        val = {{to_c_type}}(values[i])
 
-    {{if dtype == 'object'}}
-    for k in range(table.n_buckets):
-        if kh_exist_{{ttype}}(table, k):
-            result_keys[i] = <{{dtype}}>table.keys[k]
-            result_counts[i] = table.vals[k]
-            i += 1
-    {{else}}
-    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_{{ttype}}(table, k):
-                result_keys[i] = {{to_dtype}}(table.keys[k])
-                result_counts[i] = table.vals[k]
-                i += 1
+        if not is_nan_{{c_type}}(val) or not dropna:
+            k = kh_get_{{ttype}}(table, val)
+            if k != table.n_buckets:
+                unique_key_index = table.vals[k]
+                result_counts.data.data[unique_key_index] += 1
+            else:
+                k = kh_put_{{ttype}}(table, val, &ret)
+                table.vals[k] = unique_key_count
+                result_keys.append(val)
+                result_counts.append(1)
+                unique_key_count += 1
     {{endif}}
 
     kh_destroy_{{ttype}}(table)
 
-    {{if dtype == 'object'}}
-    return result_keys, result_counts
-    {{else}}
-    return np.asarray(result_keys), np.asarray(result_counts)
-    {{endif}}
+    return result_keys.to_array(), result_counts.to_array()
 
 
 @cython.wraparound(False)
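
The loop added in this second hunk is an insertion-ordered counter: the khash table maps each key to the index at which it was first seen, while two growable vectors record keys and counts in that same order, so results come back in order of first appearance rather than bucket order (the GH39009 change). A rough pure-Python equivalent, as a sketch only: `value_count_sketch` is not a pandas function, and `val != val` is a simplified stand-in for `checknull` / `is_nan_*`:

```python
import numpy as np

def value_count_sketch(values, dropna, navalue=np.nan):
    # dict plays the role of the khash table: key -> index of first appearance
    first_seen = {}
    # plain lists play the role of {{name}}Vector / Int64Vector
    result_keys, result_counts = [], []
    for val in values:
        is_null = val != val          # simplified NA check (NaN only)
        if is_null and dropna:
            continue
        if is_null:
            val = navalue             # collapse all NAs onto one representative key
        idx = first_seen.get(val)
        if idx is not None:
            result_counts[idx] += 1   # seen before: bump the existing count
        else:
            first_seen[val] = len(result_keys)  # remember order of first appearance
            result_keys.append(val)
            result_counts.append(1)
    return np.asarray(result_keys), np.asarray(result_counts, dtype=np.int64)

# value_count_sketch([3.0, 1.0, 3.0, np.nan, 1.0, 3.0], dropna=False)
# -> (array([ 3.,  1., nan]), array([3, 2, 1]))
```
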