@@ -118,9 +118,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
118
118
@cython.wraparound(False)
119
119
@cython.boundscheck(False)
120
120
{{if dtype == 'object'}}
121
- cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
121
+ cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None ):
122
122
{{else}}
123
- cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
123
+ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None ):
124
124
{{endif}}
125
125
cdef:
126
126
int ret = 0
@@ -129,10 +129,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
129
129
{{else}}
130
130
PyObject* value
131
131
{{endif}}
132
- Py_ssize_t i, n = len(values)
132
+ Py_ssize_t i, n = len(values), first_na = -1
133
133
khiter_t k
134
134
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
135
135
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
136
+ bint seen_na = False, uses_mask = mask is not None
137
+ bint seen_multiple_na = False
136
138
137
139
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
138
140
@@ -147,9 +149,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
147
149
{{endif}}
148
150
for i in range(n - 1, -1, -1):
149
151
# equivalent: range(n)[::-1], which cython doesn't like in nogil
150
- value = {{to_c_type}}(values[i])
151
- kh_put_{{ttype}}(table, value, &ret)
152
- out[i] = ret == 0
152
+ if uses_mask and mask[i]:
153
+ if seen_na:
154
+ out[i] = True
155
+ else:
156
+ out[i] = False
157
+ seen_na = True
158
+ else:
159
+ value = {{to_c_type}}(values[i])
160
+ kh_put_{{ttype}}(table, value, &ret)
161
+ out[i] = ret == 0
153
162
154
163
elif keep == 'first':
155
164
{{if dtype == 'object'}}
@@ -158,9 +167,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
158
167
with nogil:
159
168
{{endif}}
160
169
for i in range(n):
161
- value = {{to_c_type}}(values[i])
162
- kh_put_{{ttype}}(table, value, &ret)
163
- out[i] = ret == 0
170
+ if uses_mask and mask[i]:
171
+ if seen_na:
172
+ out[i] = True
173
+ else:
174
+ out[i] = False
175
+ seen_na = True
176
+ else:
177
+ value = {{to_c_type}}(values[i])
178
+ kh_put_{{ttype}}(table, value, &ret)
179
+ out[i] = ret == 0
164
180
165
181
else:
166
182
{{if dtype == 'object'}}
@@ -169,15 +185,28 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
169
185
with nogil:
170
186
{{endif}}
171
187
for i in range(n):
172
- value = {{to_c_type}}(values[i])
173
- k = kh_get_{{ttype}}(table, value)
174
- if k != table.n_buckets:
175
- out[table.vals[k]] = 1
176
- out[i] = 1
188
+ if uses_mask and mask[i]:
189
+ if not seen_na:
190
+ first_na = i
191
+ seen_na = True
192
+ out[i] = 0
193
+ elif not seen_multiple_na:
194
+ out[i] = 1
195
+ out[first_na] = 1
196
+ seen_multiple_na = True
197
+ else:
198
+ out[i] = 1
199
+
177
200
else:
178
- k = kh_put_{{ttype}}(table, value, &ret)
179
- table.vals[k] = i
180
- out[i] = 0
201
+ value = {{to_c_type}}(values[i])
202
+ k = kh_get_{{ttype}}(table, value)
203
+ if k != table.n_buckets:
204
+ out[table.vals[k]] = 1
205
+ out[i] = 1
206
+ else:
207
+ k = kh_put_{{ttype}}(table, value, &ret)
208
+ table.vals[k] = i
209
+ out[i] = 0
181
210
182
211
kh_destroy_{{ttype}}(table)
183
212
return out
@@ -301,37 +330,37 @@ cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=N
301
330
raise TypeError(values.dtype)
302
331
303
332
304
- cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
333
+ cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None ):
305
334
if htfunc_t is object:
306
- return duplicated_object(values, keep)
335
+ return duplicated_object(values, keep, mask=mask )
307
336
308
337
elif htfunc_t is int8_t:
309
- return duplicated_int8(values, keep)
338
+ return duplicated_int8(values, keep, mask=mask )
310
339
elif htfunc_t is int16_t:
311
- return duplicated_int16(values, keep)
340
+ return duplicated_int16(values, keep, mask=mask )
312
341
elif htfunc_t is int32_t:
313
- return duplicated_int32(values, keep)
342
+ return duplicated_int32(values, keep, mask=mask )
314
343
elif htfunc_t is int64_t:
315
- return duplicated_int64(values, keep)
344
+ return duplicated_int64(values, keep, mask=mask )
316
345
317
346
elif htfunc_t is uint8_t:
318
- return duplicated_uint8(values, keep)
347
+ return duplicated_uint8(values, keep, mask=mask )
319
348
elif htfunc_t is uint16_t:
320
- return duplicated_uint16(values, keep)
349
+ return duplicated_uint16(values, keep, mask=mask )
321
350
elif htfunc_t is uint32_t:
322
- return duplicated_uint32(values, keep)
351
+ return duplicated_uint32(values, keep, mask=mask )
323
352
elif htfunc_t is uint64_t:
324
- return duplicated_uint64(values, keep)
353
+ return duplicated_uint64(values, keep, mask=mask )
325
354
326
355
elif htfunc_t is float64_t:
327
- return duplicated_float64(values, keep)
356
+ return duplicated_float64(values, keep, mask=mask )
328
357
elif htfunc_t is float32_t:
329
- return duplicated_float32(values, keep)
358
+ return duplicated_float32(values, keep, mask=mask )
330
359
331
360
elif htfunc_t is complex128_t:
332
- return duplicated_complex128(values, keep)
361
+ return duplicated_complex128(values, keep, mask=mask )
333
362
elif htfunc_t is complex64_t:
334
- return duplicated_complex64(values, keep)
363
+ return duplicated_complex64(values, keep, mask=mask )
335
364
336
365
else:
337
366
raise TypeError(values.dtype)
0 commit comments