@@ -194,6 +194,7 @@ EI_IMPULSE_ERROR run_nn_inference_tflite_full(
 #include <string>
 #include <filesystem>
 #include <stdlib.h>
+#include <map>
 #include "tflite/linux-jetson-nano/libeitrt.h"
 
 #if __APPLE__
@@ -202,7 +203,8 @@ EI_IMPULSE_ERROR run_nn_inference_tflite_full(
 #include <linux/limits.h>
 #endif
 
-EiTrt *ei_trt_handle = NULL;
+EiTrt* ei_trt_handle;
+std::map<int, bool> ei_trt_models_init;
 
 inline bool file_exists(char *model_file_name)
 {
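These two globals replace the old one-engine-per-process handle: a single shared `EiTrt` context plus a map recording which learning-block indices already have a built engine, so an impulse with several learning blocks can run them all through one context. A runnable sketch of the pattern, with a hypothetical `EiTrtStub` in place of the real libeitrt types:

```cpp
#include <cstdio>
#include <map>

// Hypothetical stand-in for the libeitrt context type.
struct EiTrtStub { };

static EiTrtStub* handle = nullptr;
static std::map<int, bool> models_init; // which block indices are built

static void ensure_block_ready(int block_index, const char *engine_path) {
    if (handle == nullptr) {
        handle = new EiTrtStub(); // shared context, created on first use
    }
    if (models_init.count(block_index) == 0) { // first call for this block
        printf("building engine %d from %s\n", block_index, engine_path);
        models_init[block_index] = true;       // never rebuilt afterwards
    }
}

int main(void) {
    ensure_block_ready(0, "/tmp/a.engine"); // builds block 0
    ensure_block_ready(0, "/tmp/a.engine"); // no-op, already built
    ensure_block_ready(1, "/tmp/b.engine"); // builds block 1
    return 0;
}
```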
@@ -217,6 +219,7 @@ inline bool file_exists(char *model_file_name)
 
 EI_IMPULSE_ERROR write_model_to_file(
     const ei_impulse_t *impulse,
+    uint32_t learn_block_index,
     char *model_file_name,
     const unsigned char *model,
     size_t model_size,
@@ -251,20 +254,22 @@ EI_IMPULSE_ERROR write_model_to_file(
         snprintf(
             model_file_name,
             PATH_MAX,
-            "/tmp/ei-%d-%d.engine",
+            "/tmp/ei-%d-%d-%d.engine",
             impulse->project_id,
-            impulse->deploy_version);
+            impulse->deploy_version,
+            impulse->learning_blocks[learn_block_index].blockId);
     }
     else {
         std::filesystem::path p(current_exe_path);
         snprintf(
             model_file_name,
             PATH_MAX,
-            "%s/%s-project%d-v%d.engine",
+            "%s/%s-project%d-v%d-%d.engine",
             p.parent_path().c_str(),
             p.stem().c_str(),
             impulse->project_id,
-            impulse->deploy_version);
+            impulse->deploy_version,
+            impulse->learning_blocks[learn_block_index].blockId);
     }
 
     bool fexists = file_exists(model_file_name);
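Embedding the learning block's `blockId` in the cached engine filename keeps multi-model impulses from sharing, and overwriting, a single `/tmp/ei-<project>-<version>.engine` file. A small runnable sketch of the resulting names; the IDs here are invented:

```cpp
#include <cstdio>

int main(void) {
    char name[256];
    // Same format string as the patch, with made-up IDs.
    snprintf(name, sizeof(name), "/tmp/ei-%d-%d-%d.engine",
             /* project_id */ 12345, /* deploy_version */ 7, /* blockId */ 42);
    printf("%s\n", name); // prints /tmp/ei-12345-7-42.engine
    return 0;
}
```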
@@ -323,30 +328,43 @@ EI_IMPULSE_ERROR run_nn_inference(
 #error "TensorRT requires an unquantized network"
 #endif
 
-    static bool first_run = true;
     static char model_file_name[PATH_MAX];
-    if (first_run) {
-        write_model_to_file(impulse, model_file_name, graph_config->model, graph_config->model_size);
-        first_run = false;
-    }
+    // writes the model file to filesystem (if and only if it doesn't exist)
+    write_model_to_file(impulse, learn_block_index, model_file_name, graph_config->model, graph_config->model_size);
 
-    float *out_data = (float*)ei_malloc(impulse->tflite_output_features_count * sizeof(float));
-    if (out_data == nullptr) {
-        ei_printf("ERR: Cannot allocate memory for output data\n");
+    // create context for building and executing TensorRT engine(s)
+    if (ei_trt_handle == nullptr) {
+        ei_trt_handle = libeitrt::create_EiTrt(debug);
+        libeitrt::setMaxWorkspaceSize(ei_trt_handle, 1 << 29); // 512 MB
+
+        if (debug) {
+            ei_printf("Using EI TensorRT lib v%d.%d.%d\r\n", libeitrt::getMajorVersion(ei_trt_handle),
+                libeitrt::getMinorVersion(ei_trt_handle), libeitrt::getPatchVersion(ei_trt_handle));
+        }
     }
 
-    // lazy initialize tensorRT context
-    if (ei_trt_handle == nullptr) {
-        ei_trt_handle = libeitrt::create_EiTrt(model_file_name, debug);
+    // lazy initialize TensorRT models and warm up only once per model
+    if (ei_trt_models_init.count(learn_block_index) <= 0) {
+        libeitrt::build(ei_trt_handle, learn_block_index, model_file_name);
+        libeitrt::warmUp(ei_trt_handle, learn_block_index, 200);
+        ei_trt_models_init[learn_block_index] = true;
     }
 
+    int input_size = libeitrt::getInputSize(ei_trt_handle, learn_block_index);
+    int output_size = libeitrt::getOutputSize(ei_trt_handle, learn_block_index);
+
 #if EI_CLASSIFIER_SINGLE_FEATURE_INPUT == 0
     size_t mtx_size = impulse->dsp_blocks_size + impulse->learning_blocks_size;
     ei::matrix_t* matrix = NULL;
 
-    ei::matrix_t combined_matrix(1, impulse->nn_input_frame_size);
-    uint32_t buf_pos = 0;
+    size_t combined_matrix_size = get_feature_size(fmatrix, input_block_ids_size, input_block_ids, mtx_size);
+    if ((input_size >= 0) && ((size_t)input_size != combined_matrix_size)) {
+        ei_printf("ERR: Invalid input features size, %ld given (expected: %d)\n", combined_matrix_size, input_size);
+        return EI_IMPULSE_INVALID_SIZE;
+    }
+    ei::matrix_t combined_matrix(1, combined_matrix_size);
 
+    uint32_t buf_pos = 0;
    for (size_t i = 0; i < input_block_ids_size; i++) {
         size_t cur_mtx = input_block_ids[i];
@@ -364,26 +382,38 @@ EI_IMPULSE_ERROR run_nn_inference(
     ei::matrix_t* matrix = fmatrix[0].matrix;
 #endif
 
-    uint64_t ctx_start_us = ei_read_timer_us();
+    // copy input data to gpu
+    libeitrt::copyInputToDevice(ei_trt_handle, learn_block_index, matrix->buffer,
+        input_size * sizeof(float));
 
-    libeitrt::infer(ei_trt_handle, matrix->buffer, out_data, impulse->tflite_output_features_count);
+    libeitrt::infer(ei_trt_handle, learn_block_index);
 
-    uint64_t ctx_end_us = ei_read_timer_us();
+    float *out_data = (float*)ei_malloc(output_size * sizeof(float));
+    if (out_data == nullptr) {
+        ei_printf("ERR: Cannot allocate memory for output data\n");
+        return EI_IMPULSE_ALLOC_FAILED;
+    }
 
-    result->timing.classification_us = ctx_end_us - ctx_start_us;
+    // copy output data from gpu
+    libeitrt::copyOutputToHost(ei_trt_handle, learn_block_index, out_data,
+        output_size * sizeof(float));
+
+
+    // get inference time
+    result->timing.classification_us = libeitrt::getInferenceUs(ei_trt_handle, learn_block_index);
     result->timing.classification = (int)(result->timing.classification_us / 1000);
 
     if (result->copy_output) {
         matrix_t *output_matrix = fmatrix[impulse->dsp_blocks_size + learn_block_index].matrix;
         const size_t matrix_els = output_matrix->rows * output_matrix->cols;
 
-        if (impulse->tflite_output_features_count != matrix_els) {
-            ei_printf("ERR: output tensor has size %d, but input matrix has has size %d\n",
-                impulse->tflite_output_features_count, (int)matrix_els);
+        if ((output_size >= 0) && ((size_t)output_size != matrix_els)) {
+            ei_printf("ERR: output tensor has size %d, but input matrix has size %d\n",
+                output_size, (int)matrix_els);
             ei_free(out_data);
             return EI_IMPULSE_INVALID_SIZE;
         }
-        memcpy(output_matrix->buffer, out_data, impulse->tflite_output_features_count * sizeof(float));
+        memcpy(output_matrix->buffer, out_data, output_size * sizeof(float));
         ei_free(out_data);
         return EI_IMPULSE_OK;
     }
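Note the timing change in this hunk: instead of bracketing `infer()` with `ei_read_timer_us()` on the host, the reported time now comes from the library via `getInferenceUs()`, so host-side copies and the (now post-inference) `out_data` allocation presumably no longer inflate the measurement. A runnable sketch of the per-call sequence, with stub functions standing in for the libeitrt device calls; only the call order matters here:

```cpp
#include <cstdio>
#include <cstdlib>

// Stubs for the libeitrt device calls; no real GPU work happens.
static void copyInputToDevice(const float *in, size_t n_bytes) { (void)in; (void)n_bytes; }
static void infer(void) { /* engine executes on the GPU */ }
static void copyOutputToHost(float *out, size_t n_bytes) {
    for (size_t i = 0; i < n_bytes / sizeof(float); i++) out[i] = 0.5f;
}
static long getInferenceUs(void) { return 1234; } // timed by the library

int main(void) {
    float input[5] = {1, 2, 3, 4, 5};
    const int output_size = 3;

    copyInputToDevice(input, sizeof(input));                  // 1. host -> device
    infer();                                                  // 2. run the engine
    float *out_data = (float *)malloc(output_size * sizeof(float));
    if (out_data == NULL) return 1;                           // 3. allocate after infer
    copyOutputToHost(out_data, output_size * sizeof(float));  // 4. device -> host
    printf("took %ld us, out[0] = %.2f\n", getInferenceUs(), out_data[0]);
    free(out_data);
    return 0;
}
```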
@@ -445,6 +475,16 @@ EI_IMPULSE_ERROR run_nn_inference(
                 debug);
             break;
         }
+        case EI_CLASSIFIER_LAST_LAYER_YOLO_PRO: {
+            fill_res = fill_result_struct_f32_yolo_pro(
+                impulse,
+                block_config,
+                result,
+                out_data,
+                impulse->tflite_output_features_count,
+                debug);
+            break;
+        }
         default: {
             ei_printf(
                 "ERR: Unsupported object detection last layer (%d)\n",