trt-llm-rag-windows

Update TensorRT-LLM to v0.9 with the latest API

Open engineer1109 opened this issue 2 years ago • 9 comments

Update TensorRT-LLM to v0.9 with the latest API (ModelRunner/ModelRunnerCpp).

Tested successfully in a Linux Docker container with Llama-2-13b-chat-hf.
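As a rough illustration of the API this update targets: TensorRT-LLM 0.9 exposes `ModelRunner` (and `ModelRunnerCpp`) in `tensorrt_llm.runtime`. A minimal sketch, where the engine path and sampling parameters are illustrative, not taken from this PR:

```python
# Hedged sketch of the newer TensorRT-LLM runtime API this update targets.
# ModelRunner.from_dir / generate exist in TensorRT-LLM 0.9; the engine
# directory and sampling parameters below are illustrative assumptions.
def generate_with_model_runner(engine_dir, batch_input_ids, end_id, pad_id):
    # Imported lazily so the sketch can be read without tensorrt_llm installed.
    from tensorrt_llm.runtime import ModelRunner  # requires tensorrt_llm >= 0.9

    # Load the serialized engine (config.json + rank0.engine) from disk.
    runner = ModelRunner.from_dir(engine_dir=engine_dir, rank=0)

    # Generate token ids for a batch of tokenized prompts.
    return runner.generate(
        batch_input_ids,
        max_new_tokens=128,
        end_id=end_id,
        pad_id=pad_id,
    )
```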

engineer1109 avatar Feb 28 '24 02:02 engineer1109

This is cool, but the branch you based this on is an unstable dev branch. I think rel (0.8.0) is the stable one :) The naming of the branches is a bit off, if you ask me.

raymondbernard avatar Apr 04 '24 23:04 raymondbernard

[TensorRT-LLM] TensorRT-LLM version: 0.9.0, TensorRT 9.3.0.post12.dev1. Works with your commits.

suede299 avatar Apr 26 '24 04:04 suede299

@suede299 Check your JSON/YAML config; has the structure changed?

engineer1109 avatar Apr 26 '24 04:04 engineer1109

> @suede299 Check your JSON/YAML config; has the structure changed?

Is it the .json under RAG\trt-llm-rag-windows-main\config? Maybe I don't know how to modify it.

I went back through the TensorRT-LLM docs and couldn't find anything config-related.

suede299 avatar Apr 26 '24 04:04 suede299

@suede299 NVIDIA likes to change the config file format of the TRT engine. When you generate the TRT engine files, you get a config.json and a rank0.engine, like:

{
    "version": "0.9.0.dev2024022000",
    "pretrained_config": {
        "architecture": "LlamaForCausalLM",
        "dtype": "float16",
        "logits_dtype": "float32",
        "vocab_size": 32000,
        "max_position_embeddings": 4096,
        "hidden_size": 5120,
        "num_hidden_layers": 40,
        "num_attention_heads": 40,
        "num_key_value_heads": 40,
        "head_size": 128,
        "hidden_act": "silu",
        "intermediate_size": 13824,
        "norm_epsilon": 1e-05,
        "position_embedding_type": "rope_gpt_neox",
        "use_prompt_tuning": false,
        "use_parallel_embedding": false,
        "embedding_sharding_dim": 0,
        "share_embedding_table": false,
        "mapping": {
            "world_size": 1,
            "tp_size": 1,
            "pp_size": 1
        },
        "kv_dtype": "float16",
        "max_lora_rank": 64,
        "rotary_base": 10000.0,
        "rotary_scaling": null,
        "moe_num_experts": 0,
        "moe_top_k": 0,
        "moe_tp_mode": 2,
        "moe_normalization_mode": 1,
        "enable_pos_shift": false,
        "dense_context_fmha": false,
        "lora_target_modules": null,
        "hf_modules_to_trtllm_modules": {
            "q_proj": "attn_q",
            "k_proj": "attn_k",
            "v_proj": "attn_v",
            "o_proj": "attn_dense",
            "gate_proj": "mlp_h_to_4h",
            "down_proj": "mlp_4h_to_h",
            "up_proj": "mlp_gate"
        },
        "trtllm_modules_to_hf_modules": {
            "attn_q": "q_proj",
            "attn_k": "k_proj",
            "attn_v": "v_proj",
            "attn_dense": "o_proj",
            "mlp_h_to_4h": "gate_proj",
            "mlp_4h_to_h": "down_proj",
            "mlp_gate": "up_proj"
        },
        "disable_weight_only_quant_plugin": false,
        "mlp_bias": false,
        "attn_bias": false,
        "quantization": {
            "quant_algo": "W8A16",
            "kv_cache_quant_algo": null,
            "group_size": 128,
            "has_zero_point": false,
            "pre_quant_scale": false,
            "exclude_modules": null,
            "sq_use_plugin": false
        }
    },
    "build_config": {
        "max_input_len": 4096,
        "max_output_len": 1024,
        "max_batch_size": 1,
        "max_beam_width": 1,
        "max_num_tokens": 4096,
        "max_prompt_embedding_table_size": 0,
        "gather_context_logits": false,
        "gather_generation_logits": false,
        "strongly_typed": false,
        "builder_opt": null,
        "profiling_verbosity": "layer_names_only",
        "enable_debug_output": false,
        "max_draft_len": 0,
        "plugin_config": {
            "bert_attention_plugin": "float16",
            "gpt_attention_plugin": "float16",
            "gemm_plugin": "float16",
            "smooth_quant_gemm_plugin": null,
            "identity_plugin": null,
            "layernorm_quantization_plugin": null,
            "rmsnorm_quantization_plugin": null,
            "nccl_plugin": null,
            "lookup_plugin": null,
            "lora_plugin": null,
            "weight_only_groupwise_quant_matmul_plugin": null,
            "weight_only_quant_matmul_plugin": "float16",
            "quantize_per_token_plugin": false,
            "quantize_tensor_plugin": false,
            "moe_plugin": "float16",
            "context_fmha": true,
            "context_fmha_fp32_acc": false,
            "paged_kv_cache": true,
            "remove_input_padding": true,
            "use_custom_all_reduce": true,
            "multi_block_mode": false,
            "enable_xqa": true,
            "attention_qk_half_accumulation": false,
            "tokens_per_block": 128,
            "use_paged_context_fmha": false,
            "use_context_fmha_for_generation": false
        }
    }
}
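Since this schema shifts between TensorRT-LLM releases, it can help to sanity-check the version recorded in the engine's config.json before trying to load it. A minimal sketch; the helper names are assumptions, but the top-level "version" key matches the config dumped above:

```python
import json
from pathlib import Path


def read_engine_version(engine_dir):
    """Return the TensorRT-LLM version string recorded in the engine's config.json."""
    with (Path(engine_dir) / "config.json").open() as f:
        # Top-level "version" key, as in the config shown above.
        return json.load(f)["version"]


def check_engine_version(engine_dir, expected_prefix):
    """True if the engine was built by a matching TensorRT-LLM release."""
    return read_engine_version(engine_dir).startswith(expected_prefix)
```

Failing fast with a clear message here beats the opaque errors that surface later when the runtime tries to deserialize a mismatched engine.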

engineer1109 avatar Apr 26 '24 07:04 engineer1109

@suede299 This is why version compatibility is so hard. NVIDIA loves to change the config file format with every release.

engineer1109 avatar Apr 26 '24 07:04 engineer1109

Thanks for the reply. I modified config.json myself as the errors suggested, but still ended up with this error: [ERROR] 6: The engine plan file is not compatible with this version of TensorRT, expecting library version 9.3.0.1 got 9.2.0.5, please rebuild. I decided to rebuild the engine file, but that turned out to be too difficult for me. This thing is so unfriendly to the Windows platform.
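The error above reflects TensorRT's strict rule: an engine is only loadable by the exact library version that built it, down to the last digit. A sketch of that exact-match rule (the function is illustrative, not TensorRT's actual API):

```python
def versions_compatible(engine_version, library_version):
    """TensorRT-style strict check: engine and library versions must match exactly."""
    return engine_version.split(".") == library_version.split(".")


# The mismatch from the error message above: an engine built with 9.2.0.5
# cannot be loaded by a library expecting 9.3.0.1, so a rebuild is required.
```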

suede299 avatar Apr 26 '24 12:04 suede299

@suede299 TensorRT is better suited to Docker servers or edge devices than to consumer clients. A TensorRT engine must be built with exactly the same version as the TensorRT library that loads it; the serialized format differs between every release, even between 9.2.0 and 9.2.1. TensorRT checks a magic number in the engine file header to verify which version generated it.

Sometimes the engine also needs to be regenerated when your card or driver SDK changes. Engine files are very fragile; they need an environment where neither the hardware nor the software changes.

Consumers would probably prefer backward compatibility with old versions, but the TensorRT team changes the format with every release.
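One way to live with that fragility is to record the build environment next to the serialized engine and refuse to load when anything has changed. A small sketch under assumed field names (the GPU, driver, and TensorRT version strings are examples; nothing here is TensorRT API):

```python
import json
from pathlib import Path


def save_env_fingerprint(engine_dir, gpu, driver_version, trt_version):
    """Record the environment the engine was built in, next to the engine file."""
    fingerprint = {"gpu": gpu, "driver": driver_version, "trt": trt_version}
    (Path(engine_dir) / "env.json").write_text(json.dumps(fingerprint))


def env_matches(engine_dir, gpu, driver_version, trt_version):
    """True only if the current environment equals the recorded one."""
    recorded = json.loads((Path(engine_dir) / "env.json").read_text())
    return recorded == {"gpu": gpu, "driver": driver_version, "trt": trt_version}
```

On a mismatch, the loader can then prompt for a rebuild up front instead of failing mid-deserialization.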

engineer1109 avatar Apr 26 '24 14:04 engineer1109

Yes, I gave up. Quantizing the Gemma model gives wrong results; I found a change to the Gemma script on GitHub and tried to update, but it required version 0.10.dev, and there is no whl available for the Windows platform at all. Thank you again.

suede299 avatar Apr 27 '24 07:04 suede299