@@ -1114,206 +1114,6 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
1114
1114
kh_destroy_int64(table)
1115
1115
return out
1116
1116
1117
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_int64(list codes, list cats, int N, int recode_size):
    """
    Combine several (codes, categories) pairs with int64 categories into
    one codes array and one categories array.

    Parameters
    ----------
    codes : list
        One int64 buffer of codes per input. A code of -1 is copied to the
        output unchanged; any other code is used as an index into the
        matching entry of ``cats``.
    cats : list
        One int64 buffer of categories per input, parallel to ``codes``.
        The categories of the first entry are assumed to be unique.
    N : int
        Length of the output codes array. Must be at least the total number
        of codes over all inputs; this is not checked because bounds
        checking is disabled.
    recode_size : int
        Length of the scratch recode table. Must be at least the largest
        number of categories among the inputs after the first; not checked.

    Returns
    -------
    tuple of (ndarray[int64], ndarray[int64])
        ``new_codes`` and ``new_categories``, where ``new_categories[c]`` is
        the category that code ``c`` in ``new_codes`` refers to.
    """
    cdef:
        kh_int64_t *table
        int64_t[:] new_codes = np.empty(N, dtype='int64')
        int64_t[:] recode = np.empty(recode_size, dtype='int64')
        int64_t[:] current_codes
        int64_t[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    # create the table only after the allocations above succeeded, and
    # release it in ``finally`` so it is not leaked if anything below raises
    table = kh_init_int64()
    try:
        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]

            with nogil:
                n_cats = current_categories.shape[0]
                n_codes = current_codes.shape[0]
                if cat_id == 0:
                    kh_resize_int64(table, n_cats)
                    # first pass: dump directly into the table since
                    # uniqueness is guaranteed
                    for j in range(n_cats):
                        k = kh_put_int64(table, current_categories[j], &ret)
                        table.vals[k] = current_code
                        current_code += 1
                    # the first input's codes are already master codes
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
                else:
                    for j in range(n_cats):
                        k = kh_get_int64(table, current_categories[j])

                        # if a new category, add to the master hash table
                        if k == table.n_buckets:
                            k = kh_put_int64(table, current_categories[j],
                                             &ret)
                            table.vals[k] = current_code
                            current_code += 1
                        # add to the recode table, mapping from
                        # orig category -> master category
                        recode[j] = table.vals[k]

                    for j in range(n_codes):
                        # continue filling new codes, this pass looking
                        # up in the recode table
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from the hash table
        new_categories = np.zeros(table.n_occupied, dtype='int64')
        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_int64(table, k):
                    # store each key at the position given by its code:
                    # buckets are visited in hash order, not in the order
                    # the codes were handed out
                    new_categories[table.vals[k]] = table.keys[k]
    finally:
        kh_destroy_int64(table)
    return np.asarray(new_codes), np.asarray(new_categories)
1182
-
1183
# this could be fused with the int version
# but no great way to work with hash table
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_float64(list codes, list cats, int N, int recode_size):
    """
    Combine several (codes, categories) pairs with float64 categories into
    one codes array and one categories array.

    Parameters
    ----------
    codes : list
        One int64 buffer of codes per input. A code of -1 is copied to the
        output unchanged; any other code is used as an index into the
        matching entry of ``cats``.
    cats : list
        One float64 buffer of categories per input, parallel to ``codes``.
        The categories of the first entry are assumed to be unique.
    N : int
        Length of the output codes array. Must be at least the total number
        of codes over all inputs; this is not checked because bounds
        checking is disabled.
    recode_size : int
        Length of the scratch recode table. Must be at least the largest
        number of categories among the inputs after the first; not checked.

    Returns
    -------
    tuple of (ndarray[int64], ndarray[float64])
        ``new_codes`` and ``new_categories``, where ``new_categories[c]`` is
        the category that code ``c`` in ``new_codes`` refers to.
    """
    cdef:
        kh_float64_t *table
        int64_t[:] new_codes = np.empty(N, dtype='int64')
        int64_t[:] recode = np.empty(recode_size, dtype='int64')
        int64_t[:] current_codes
        float64_t[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    # create the table only after the allocations above succeeded, and
    # release it in ``finally`` so it is not leaked if anything below raises
    table = kh_init_float64()
    try:
        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]

            with nogil:
                n_cats = current_categories.shape[0]
                n_codes = current_codes.shape[0]
                if cat_id == 0:
                    # first pass: dump directly in, since uniqueness is
                    # guaranteed and the codes don't need recoding
                    kh_resize_float64(table, n_cats)
                    for j in range(n_cats):
                        k = kh_put_float64(table, current_categories[j],
                                           &ret)
                        table.vals[k] = current_code
                        current_code += 1
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
                else:
                    for j in range(n_cats):
                        k = kh_get_float64(table, current_categories[j])

                        # if a new category, add to the master hash table
                        if k == table.n_buckets:
                            k = kh_put_float64(table, current_categories[j],
                                               &ret)
                            table.vals[k] = current_code
                            current_code += 1

                        # add to the recode table, mapping from
                        # orig category -> master category
                        recode[j] = table.vals[k]

                    for j in range(n_codes):
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from the hash table
        new_categories = np.zeros(table.n_occupied, dtype='float64')
        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_float64(table, k):
                    # store each key at the position given by its code:
                    # buckets are visited in hash order, not in the order
                    # the codes were handed out
                    new_categories[table.vals[k]] = table.keys[k]
    finally:
        kh_destroy_float64(table)
    return np.asarray(new_codes), np.asarray(new_categories)
1248
-
1249
-
1250
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_object(list codes, list cats, int N, int recode_size):
    """
    Combine several (codes, categories) pairs with object categories into
    one codes array and one categories array.

    Parameters
    ----------
    codes : list
        One int64 buffer of codes per input. A code of -1 is copied to the
        output unchanged; any other code is used as an index into the
        matching entry of ``cats``.
    cats : list
        One object buffer of categories per input, parallel to ``codes``.
        The categories of the first entry are assumed to be unique.
    N : int
        Length of the output codes array. Must be at least the total number
        of codes over all inputs; this is not checked because bounds
        checking is disabled.
    recode_size : int
        Length of the scratch recode table. Must be at least the largest
        number of categories among the inputs after the first; not checked.

    Returns
    -------
    tuple of (ndarray[int64], ndarray[object])
        ``new_codes`` and ``new_categories``, where ``new_categories[c]`` is
        the category that code ``c`` in ``new_codes`` refers to.
    """
    cdef:
        kh_pymap_t *table
        int64_t[:] new_codes = np.empty(N, dtype='int64')
        int64_t[:] recode = np.empty(recode_size, dtype='int64')
        int64_t[:] current_codes
        object[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    # create the table only after the allocations above succeeded, and
    # release it in ``finally`` so it is not leaked if anything below raises
    table = kh_init_pymap()
    try:
        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]

            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]
            if cat_id == 0:
                kh_resize_pymap(table, n_cats)
                # first pass: dump directly into the table since uniqueness
                # is guaranteed and the codes don't need recoding
                for j in range(n_cats):
                    k = kh_put_pymap(table,
                                     <PyObject*>current_categories[j], &ret)
                    table.vals[k] = current_code
                    current_code += 1
                # only the pure int64 copy runs without the GIL; the table
                # operations above handle Python objects
                with nogil:
                    # reuse codes
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
            else:
                for j in range(n_cats):
                    k = kh_get_pymap(table, <PyObject*>current_categories[j])

                    # if a new category, add to the master hash table
                    if k == table.n_buckets:
                        k = kh_put_pymap(table,
                                         <PyObject*>current_categories[j],
                                         &ret)
                        table.vals[k] = current_code
                        current_code += 1

                    # add to the recode table, mapping from
                    # orig category -> master category
                    recode[j] = table.vals[k]

                with nogil:
                    for j in range(n_codes):
                        # continue filling new codes, this pass looking
                        # up in the recode table
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from the hash table
        new_categories = np.zeros(table.n_occupied, dtype='object')
        for k in range(table.n_buckets):
            if kh_exist_pymap(table, k):
                # store each key at the position given by its code:
                # buckets are visited in hash order, not in the order
                # the codes were handed out
                new_categories[table.vals[k]] = <object>table.keys[k]
    finally:
        kh_destroy_pymap(table)
    return np.asarray(new_codes), np.asarray(new_categories)
1316
-
1317
1117
1318
1118
@ cython.wraparound (False )
1319
1119
@ cython.boundscheck (False )
0 commit comments