@@ -1114,6 +1114,206 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
1114
1114
kh_destroy_int64(table)
1115
1115
return out
1116
1116
1117
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_int64(list codes, list cats, int N, int recode_size):
    """
    Combine several (codes, categories) pairs with int64 categories into
    one codes array that indexes a single combined categories array.

    Parameters
    ----------
    codes : list
        One int64 buffer of codes per input; ``codes[i]`` indexes
        ``cats[i]``.  A code of -1 is passed through unchanged; every
        other code must be a valid position in its categories buffer
        (this is not checked, bounds checking is disabled).
    cats : list
        One int64 buffer of categories per input, same length as
        ``codes``.  The values of ``cats[0]`` are assumed to be unique.
    N : int
        Length of the output codes array; must be at least the total
        number of codes across all inputs.
    recode_size : int
        Length of the scratch recode buffer; must be at least the number
        of categories of every input after the first.

    Returns
    -------
    new_codes : ndarray[int64]
        The input codes, concatenated and rewritten to index
        ``new_categories``.
    new_categories : ndarray[int64]
        The distinct categories, ordered by first appearance so that
        position ``c`` holds the category that code ``c`` refers to.

    Raises
    ------
    ValueError
        If ``codes`` and ``cats`` differ in length, or if ``N`` or
        ``recode_size`` is too small for the supplied inputs.
    """
    cdef:
        kh_int64_t *table
        int64_t[:] new_codes = np.empty(N, dtype='int64')
        int64_t[:] recode = np.empty(recode_size, dtype='int64')
        int64_t[:] current_codes
        int64_t[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    if len(codes) != len(cats):
        raise ValueError("codes and cats must have the same length")

    # allocate the table last so that nothing above can leak it, and
    # release it in ``finally`` so that errors below cannot leak it either
    table = kh_init_int64()
    try:
        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]
            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]

            # bounds checking is off, so validate the sizes up front
            # instead of silently writing past the end of the buffers
            if i + n_codes > N:
                raise ValueError("N is smaller than the total number "
                                 "of codes")
            if cat_id > 0 and n_cats > recode_size:
                raise ValueError("recode_size is smaller than the number "
                                 "of categories")

            with nogil:
                if cat_id == 0:
                    kh_resize_int64(table, n_cats)
                    # first pass: dump directly into the table since
                    # uniqueness is guaranteed
                    for j in range(n_cats):
                        k = kh_put_int64(table, current_categories[j], &ret)
                        table.vals[k] = current_code
                        current_code += 1
                    # the first input's codes already match the master
                    # numbering, so reuse them as they are
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
                else:
                    for j in range(n_cats):
                        k = kh_get_int64(table, current_categories[j])

                        # if a new category, add to the master hash table
                        if k == table.n_buckets:
                            k = kh_put_int64(table, current_categories[j],
                                             &ret)
                            table.vals[k] = current_code
                            current_code += 1
                        # add to the recode table, mapping from
                        # original category -> master category
                        recode[j] = table.vals[k]

                    for j in range(n_codes):
                        # continue filling new codes, this pass
                        # looking up in the recode table
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from the hash table; each key is written
        # at the position given by its assigned code, because iterating
        # the buckets does not visit keys in insertion order
        new_categories = np.zeros(current_code, dtype='int64')
        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_int64(table, k):
                    new_categories[table.vals[k]] = table.keys[k]
    finally:
        kh_destroy_int64(table)

    return np.asarray(new_codes), np.asarray(new_categories)
1182
+
1183
# this could be fused with the int version
# but there is no great way to work with the hash table
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_float64(list codes, list cats, int N, int recode_size):
    """
    Combine several (codes, categories) pairs with float64 categories into
    one codes array that indexes a single combined categories array.

    Parameters
    ----------
    codes : list
        One int64 buffer of codes per input; ``codes[i]`` indexes
        ``cats[i]``.  A code of -1 is passed through unchanged; every
        other code must be a valid position in its categories buffer
        (this is not checked, bounds checking is disabled).
    cats : list
        One float64 buffer of categories per input, same length as
        ``codes``.  The values of ``cats[0]`` are assumed to be unique.
    N : int
        Length of the output codes array; must be at least the total
        number of codes across all inputs.
    recode_size : int
        Length of the scratch recode buffer; must be at least the number
        of categories of every input after the first.

    Returns
    -------
    new_codes : ndarray[int64]
        The input codes, concatenated and rewritten to index
        ``new_categories``.
    new_categories : ndarray[float64]
        The distinct categories, ordered by first appearance so that
        position ``c`` holds the category that code ``c`` refers to.

    Raises
    ------
    ValueError
        If ``codes`` and ``cats`` differ in length, or if ``N`` or
        ``recode_size`` is too small for the supplied inputs.
    """
    cdef:
        kh_float64_t *table
        int64_t[:] new_codes = np.empty(N, dtype='int64')
        int64_t[:] recode = np.empty(recode_size, dtype='int64')
        int64_t[:] current_codes
        float64_t[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    if len(codes) != len(cats):
        raise ValueError("codes and cats must have the same length")

    # allocate the table last so that nothing above can leak it, and
    # release it in ``finally`` so that errors below cannot leak it either
    table = kh_init_float64()
    try:
        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]
            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]

            # bounds checking is off, so validate the sizes up front
            # instead of silently writing past the end of the buffers
            if i + n_codes > N:
                raise ValueError("N is smaller than the total number "
                                 "of codes")
            if cat_id > 0 and n_cats > recode_size:
                raise ValueError("recode_size is smaller than the number "
                                 "of categories")

            with nogil:
                if cat_id == 0:
                    # first pass: dump directly in, since uniqueness is
                    # guaranteed and the codes don't need recoding
                    kh_resize_float64(table, n_cats)
                    for j in range(n_cats):
                        k = kh_put_float64(table, current_categories[j],
                                           &ret)
                        table.vals[k] = current_code
                        current_code += 1
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
                else:
                    for j in range(n_cats):
                        k = kh_get_float64(table, current_categories[j])

                        # if a new category, add to the master hash table
                        if k == table.n_buckets:
                            k = kh_put_float64(table, current_categories[j],
                                               &ret)
                            table.vals[k] = current_code
                            current_code += 1

                        # add to the recode table, mapping from
                        # original category -> master category
                        recode[j] = table.vals[k]

                    for j in range(n_codes):
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from the hash table; each key is written
        # at the position given by its assigned code, because iterating
        # the buckets does not visit keys in insertion order
        new_categories = np.zeros(current_code, dtype='float64')
        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_float64(table, k):
                    new_categories[table.vals[k]] = table.keys[k]
    finally:
        kh_destroy_float64(table)

    return np.asarray(new_codes), np.asarray(new_categories)
1248
+
1249
+
1250
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_object(list codes, list cats, int N, int recode_size):
    """
    Combine several (codes, categories) pairs with object categories into
    one codes array that indexes a single combined categories array.

    Parameters
    ----------
    codes : list
        One int64 buffer of codes per input; ``codes[i]`` indexes
        ``cats[i]``.  A code of -1 is passed through unchanged; every
        other code must be a valid position in its categories buffer
        (this is not checked, bounds checking is disabled).
    cats : list
        One object buffer of categories per input, same length as
        ``codes``.  The values of ``cats[0]`` are assumed to be unique.
        The hash table stores borrowed pointers, so the buffers in
        ``cats`` must stay alive for the duration of the call.
    N : int
        Length of the output codes array; must be at least the total
        number of codes across all inputs.
    recode_size : int
        Length of the scratch recode buffer; must be at least the number
        of categories of every input after the first.

    Returns
    -------
    new_codes : ndarray[int64]
        The input codes, concatenated and rewritten to index
        ``new_categories``.
    new_categories : ndarray[object]
        The distinct categories, ordered by first appearance so that
        position ``c`` holds the category that code ``c`` refers to.

    Raises
    ------
    ValueError
        If ``codes`` and ``cats`` differ in length, or if ``N`` or
        ``recode_size`` is too small for the supplied inputs.
    """
    cdef:
        kh_pymap_t *table
        int64_t[:] new_codes = np.empty(N, dtype='int64')
        int64_t[:] recode = np.empty(recode_size, dtype='int64')
        int64_t[:] current_codes
        object[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k
        object val

    if len(codes) != len(cats):
        raise ValueError("codes and cats must have the same length")

    # allocate the table last so that nothing above can leak it, and
    # release it in ``finally`` so that errors below cannot leak it either
    table = kh_init_pymap()
    try:
        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]
            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]

            # bounds checking is off, so validate the sizes up front
            # instead of silently writing past the end of the buffers
            if i + n_codes > N:
                raise ValueError("N is smaller than the total number "
                                 "of codes")
            if cat_id > 0 and n_cats > recode_size:
                raise ValueError("recode_size is smaller than the number "
                                 "of categories")

            if cat_id == 0:
                kh_resize_pymap(table, n_cats)
                # first pass: dump directly into the table since
                # uniqueness is guaranteed and the codes don't need recoding
                for j in range(n_cats):
                    # bind to a local first so the pointer is taken from a
                    # named reference rather than from a temporary
                    val = current_categories[j]
                    k = kh_put_pymap(table, <PyObject*>val, &ret)
                    table.vals[k] = current_code
                    current_code += 1
                with nogil:
                    # reuse codes
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
            else:
                for j in range(n_cats):
                    val = current_categories[j]
                    k = kh_get_pymap(table, <PyObject*>val)

                    # if a new category, add to the master hash table
                    if k == table.n_buckets:
                        k = kh_put_pymap(table, <PyObject*>val, &ret)
                        table.vals[k] = current_code
                        current_code += 1

                    # add to the recode table, mapping from
                    # original category -> master category
                    recode[j] = table.vals[k]

                with nogil:
                    for j in range(n_codes):
                        # continue filling new codes, this pass
                        # looking up in the recode table
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from the hash table; each key is written
        # at the position given by its assigned code, because iterating
        # the buckets does not visit keys in insertion order
        new_categories = np.zeros(current_code, dtype='object')
        for k in range(table.n_buckets):
            if kh_exist_pymap(table, k):
                new_categories[table.vals[k]] = <object>table.keys[k]
    finally:
        kh_destroy_pymap(table)

    return np.asarray(new_codes), np.asarray(new_categories)
1316
+
1117
1317
1118
1318
@ cython.wraparound (False )
1119
1319
@ cython.boundscheck (False )
0 commit comments