ggml
ggml copied to clipboard
ggml inference time is significantly slower than onnxruntime
I used ggml to deploy the MobileNetV2 model, and when I compared it with a deployment using onnxruntime, I found that ggml's inference time is nearly 100 times that of onnxruntime. My ggml inference code is as follows:
float * mobilenetv2_inference(ggml_tensor * input, mobilenetv2_model model, ggml_context * ctx0) {
ggml_tensor * result = apply_conv2d(ctx0, input, model.conv2d_layers[0]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[1]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[2]);
result = apply_conv2d(ctx0, result, model.conv2d_layers[3]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[4]);
ggml_tensor * result_res = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[5]);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[6]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[7]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[8]);
result = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result, model.conv2d_layers[9]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[10]);
result_res = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[11]);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[12]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[13]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[14]);
result_res = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[15]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[16]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[17]);
result = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result, model.conv2d_layers[18]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[19]);
result_res = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[20]);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[21]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[22]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[23]);
result_res = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result, model.conv2d_layers[24]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[25]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[26]);
result_res = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[27]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[28]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[29]);
result = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result, model.conv2d_layers[30]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[31]);
result_res = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[32]);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[33]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[34]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[35]);
result_res = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[36]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[37]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[38]);
result = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result, model.conv2d_layers[39]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[40]);
result_res = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[41]);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[42]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[43]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[44]);
result_res = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result_res, model.conv2d_layers[45]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[46]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[47]);
result = ggml_add(ctx0, result_res, result);
result = apply_conv2d(ctx0, result, model.conv2d_layers[48]);
result = apply_conv_depthwise_2d(ctx0, result, model.conv2d_layers[49]);
result = apply_conv2d_no_clamp(ctx0, result, model.conv2d_layers[50]);
result = apply_conv2d(ctx0, result, model.conv2d_layers[51]);
result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_AVG, 7, 7, 1, 1, 0, 0);
result = ggml_reshape_2d(ctx0, result, result->ne[2], result->ne[3]);
result = ggml_mul_mat(ctx0, model.gemm_layers[0].weights, result);
result = ggml_add(ctx0, result, model.gemm_layers[0].biases);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, result);
const int64_t t_start_ms = ggml_time_ms();
ggml_graph_compute_with_ctx(ctx0, gf, 1);
const int64_t t_end_ms = ggml_time_ms();
std::cout << "ggml_graph_compute_with_ctx exec time(ms): " << t_end_ms-t_start_ms << std::endl;
float * output = ggml_get_data_f32(result);
return output;
}
Is there something wrong with how I build the model? Do you have any suggestions? Thanks in advance.