System Info
- trt_llm 0.11.0.dev2024051400
- trt 10.0.1
- device: A100
- code for TensorRT-LLM: latest version on the main branch
Who can help?
@Tracin
Information
- [X] The official example scripts
- [ ] My own modified scripts
Tasks
- [X] An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- [ ] My own task or dataset (give details below)
Reproduction
Use any Llama3 weights from HF, such as Llama3-ChatQA-1.5-8B:
python3 convert_checkpoint.py --model_dir ${model_dir} --output_dir ${output_dir} --dtype float16 --use_weight_only --weight_only_precision int8 --tp_size 1
trtllm-build --checkpoint_dir ${checkpoint_dir} --gemm_plugin float16 --output_dir ${output_dir} --paged_kv_cache --weight_only_precision int8
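For reference, a quick way to sanity-check step 1 before building (a minimal sketch; "output_dir" below is a placeholder for the --output_dir passed above):

```python
# Minimal sketch: confirm the converted checkpoint's config records the
# weight-only quantization scheme before invoking trtllm-build.
# "output_dir" is a placeholder for the --output_dir used above.
import json

with open("output_dir/config.json") as f:
    config = json.load(f)

# For --use_weight_only --weight_only_precision int8, the quantization
# section is expected to report quant_algo "W8A16" (int8 weights,
# fp16 activations).
print(config.get("quantization"))
```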
Expected behavior
Checkpoint conversion and engine build both complete successfully.
Actual behavior
Traceback (most recent call last):
File "/workspace6/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 456, in
main()
File "/workspace6/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 448, in main
convert_and_save_hf(args)
File "/workspace6/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 384, in convert_and_save_hf
execute(args.workers, [convert_and_save_rank] * world_size, args)
File "/workspace6/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 407, in execute
f(args, rank)
File "/workspace6/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 371, in convert_and_save_rank
llama = LLaMAForCausalLM.from_hugging_face(
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/models/llama/model.py", line 280, in from_hugging_face
llama = convert.from_hugging_face(
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/models/llama/convert.py", line 1337, in from_hugging_face
llama.load(weights)
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/models/modeling_utils.py", line 419, in load
raise RuntimeError(
RuntimeError: Expected but not provided tensors:{'transformer.layers.17.attention.qkv.per_channel_scale', 'transformer.layers.7.attention.qkv.per_channel_scale', 'transformer.layers.28.attention.qkv.per_channel_scale', 'transformer.layers.25.mlp.gate.per_channel_scale', 'transformer.layers.8.attention.dense.per_channel_scale', 'transformer.layers.14.mlp.gate.per_channel_scale', 'transformer.layers.20.mlp.gate.per_channel_scale', 'transformer.layers.16.mlp.fc.per_channel_scale', 'transformer.layers.8.mlp.fc.per_channel_scale', 'transformer.layers.29.attention.dense.per_channel_scale', 'transformer.layers.22.mlp.fc.per_channel_scale', 'transformer.layers.24.mlp.proj.per_channel_scale', 'transformer.layers.17.mlp.gate.per_channel_scale', 'transformer.layers.4.attention.qkv.per_channel_scale', 'transformer.layers.4.mlp.fc.per_channel_scale', 'transformer.layers.25.mlp.fc.per_channel_scale', 'transformer.layers.27.mlp.gate.per_channel_scale', 'transformer.layers.31.mlp.gate.per_channel_scale', 'transformer.layers.19.mlp.proj.per_channel_scale', 'transformer.layers.7.mlp.proj.per_channel_scale', 'transformer.layers.18.mlp.fc.per_channel_scale', 'transformer.layers.1.mlp.gate.per_channel_scale', 'transformer.layers.24.attention.dense.per_channel_scale', 'transformer.layers.9.attention.qkv.per_channel_scale', 'transformer.layers.12.attention.dense.per_channel_scale', 'transformer.layers.11.mlp.proj.per_channel_scale', 'transformer.layers.9.mlp.proj.per_channel_scale', 'transformer.layers.1.mlp.fc.per_channel_scale', 'transformer.layers.7.mlp.fc.per_channel_scale', 'transformer.layers.11.mlp.fc.per_channel_scale', 'transformer.layers.2.attention.qkv.per_channel_scale', 'transformer.layers.9.mlp.fc.per_channel_scale', 'transformer.layers.28.mlp.fc.per_channel_scale', 'transformer.layers.15.attention.dense.per_channel_scale', 'transformer.layers.15.mlp.proj.per_channel_scale', 'transformer.layers.28.attention.dense.per_channel_scale', 'transformer.layers.6.mlp.gate.per_channel_scale', 'transformer.layers.10.mlp.proj.per_channel_scale', 'transformer.layers.22.mlp.proj.per_channel_scale', 'transformer.layers.29.attention.qkv.per_channel_scale', 'transformer.layers.30.attention.dense.per_channel_scale', 'transformer.layers.24.mlp.fc.per_channel_scale', 'transformer.layers.31.attention.dense.per_channel_scale', 'transformer.layers.5.mlp.proj.per_channel_scale', 'transformer.layers.30.attention.qkv.per_channel_scale', 'transformer.layers.21.mlp.gate.per_channel_scale', 'transformer.layers.26.mlp.gate.per_channel_scale', 'transformer.layers.25.attention.dense.per_channel_scale', 'transformer.layers.16.attention.qkv.per_channel_scale', 'transformer.layers.6.attention.dense.per_channel_scale', 'transformer.layers.17.mlp.fc.per_channel_scale', 'transformer.layers.18.attention.qkv.per_channel_scale', 'transformer.layers.3.mlp.fc.per_channel_scale', 'transformer.layers.7.mlp.gate.per_channel_scale', 'transformer.layers.27.mlp.proj.per_channel_scale', 'transformer.layers.14.mlp.fc.per_channel_scale', 'transformer.layers.0.mlp.proj.per_channel_scale', 'transformer.layers.12.mlp.fc.per_channel_scale', 'transformer.layers.29.mlp.proj.per_channel_scale', 'transformer.layers.26.mlp.proj.per_channel_scale', 'transformer.layers.10.attention.dense.per_channel_scale', 'transformer.layers.30.mlp.fc.per_channel_scale', 'transformer.layers.23.attention.qkv.per_channel_scale', 'transformer.layers.31.mlp.proj.per_channel_scale', 'transformer.layers.2.mlp.gate.per_channel_scale', 
'transformer.layers.3.mlp.proj.per_channel_scale', 'transformer.layers.15.attention.qkv.per_channel_scale', 'transformer.layers.2.attention.dense.per_channel_scale', 'transformer.layers.8.mlp.proj.per_channel_scale', 'transformer.layers.21.mlp.fc.per_channel_scale', 'transformer.layers.18.mlp.proj.per_channel_scale', 'transformer.layers.17.mlp.proj.per_channel_scale', 'transformer.layers.24.attention.qkv.per_channel_scale', 'transformer.layers.1.attention.qkv.per_channel_scale', 'transformer.layers.31.mlp.fc.per_channel_scale', 'transformer.layers.21.attention.dense.per_channel_scale', 'transformer.layers.12.mlp.gate.per_channel_scale', 'transformer.layers.1.mlp.proj.per_channel_scale', 'transformer.layers.3.attention.qkv.per_channel_scale', 'transformer.layers.8.attention.qkv.per_channel_scale', 'transformer.layers.12.mlp.proj.per_channel_scale', 'transformer.layers.15.mlp.fc.per_channel_scale', 'transformer.layers.29.mlp.fc.per_channel_scale', 'transformer.layers.20.attention.qkv.per_channel_scale', 'transformer.layers.19.attention.qkv.per_channel_scale', 'transformer.layers.19.mlp.fc.per_channel_scale', 'transformer.layers.4.mlp.proj.per_channel_scale', 'transformer.layers.9.mlp.gate.per_channel_scale', 'transformer.layers.6.mlp.fc.per_channel_scale', 'transformer.layers.27.attention.dense.per_channel_scale', 'transformer.layers.12.attention.qkv.per_channel_scale', 'transformer.layers.26.attention.qkv.per_channel_scale', 'transformer.layers.19.mlp.gate.per_channel_scale', 'transformer.layers.0.attention.qkv.per_channel_scale', 'transformer.layers.28.mlp.proj.per_channel_scale', 'transformer.layers.25.mlp.proj.per_channel_scale', 'transformer.layers.23.mlp.gate.per_channel_scale', 'transformer.layers.20.attention.dense.per_channel_scale', 'transformer.layers.11.mlp.gate.per_channel_scale', 'transformer.layers.21.attention.qkv.per_channel_scale', 'transformer.layers.13.attention.dense.per_channel_scale', 'transformer.layers.30.mlp.proj.per_channel_scale', 'transformer.layers.4.mlp.gate.per_channel_scale', 'transformer.layers.13.attention.qkv.per_channel_scale', 'transformer.layers.24.mlp.gate.per_channel_scale', 'transformer.layers.10.mlp.gate.per_channel_scale', 'transformer.layers.28.mlp.gate.per_channel_scale', 'transformer.layers.5.mlp.gate.per_channel_scale', 'transformer.layers.6.attention.qkv.per_channel_scale', 'transformer.layers.14.attention.qkv.per_channel_scale', 'transformer.layers.13.mlp.proj.per_channel_scale', 'transformer.layers.0.mlp.fc.per_channel_scale', 'transformer.layers.3.attention.dense.per_channel_scale', 'transformer.layers.10.attention.qkv.per_channel_scale', 'transformer.layers.5.mlp.fc.per_channel_scale', 'transformer.layers.3.mlp.gate.per_channel_scale', 'transformer.layers.11.attention.dense.per_channel_scale', 'transformer.layers.22.mlp.gate.per_channel_scale', 'transformer.layers.5.attention.qkv.per_channel_scale', 'transformer.layers.23.mlp.fc.per_channel_scale', 'transformer.layers.31.attention.qkv.per_channel_scale', 'transformer.layers.9.attention.dense.per_channel_scale', 'transformer.layers.2.mlp.fc.per_channel_scale', 'transformer.layers.4.attention.dense.per_channel_scale', 'transformer.layers.30.mlp.gate.per_channel_scale', 'transformer.layers.0.attention.dense.per_channel_scale', 'transformer.layers.14.attention.dense.per_channel_scale', 'transformer.layers.14.mlp.proj.per_channel_scale', 'transformer.layers.18.mlp.gate.per_channel_scale', 'transformer.layers.16.mlp.gate.per_channel_scale', 
'transformer.layers.18.attention.dense.per_channel_scale', 'transformer.layers.16.attention.dense.per_channel_scale', 'transformer.layers.22.attention.qkv.per_channel_scale', 'transformer.layers.25.attention.qkv.per_channel_scale', 'transformer.layers.20.mlp.proj.per_channel_scale', 'transformer.layers.2.mlp.proj.per_channel_scale', 'transformer.layers.20.mlp.fc.per_channel_scale', 'transformer.layers.10.mlp.fc.per_channel_scale', 'transformer.layers.23.mlp.proj.per_channel_scale', 'transformer.layers.15.mlp.gate.per_channel_scale', 'transformer.layers.0.mlp.gate.per_channel_scale', 'transformer.layers.5.attention.dense.per_channel_scale', 'transformer.layers.27.attention.qkv.per_channel_scale', 'transformer.layers.29.mlp.gate.per_channel_scale', 'transformer.layers.17.attention.dense.per_channel_scale', 'transformer.layers.23.attention.dense.per_channel_scale', 'transformer.layers.1.attention.dense.per_channel_scale', 'transformer.layers.16.mlp.proj.per_channel_scale', 'transformer.layers.7.attention.dense.per_channel_scale', 'transformer.layers.27.mlp.fc.per_channel_scale', 'transformer.layers.26.attention.dense.per_channel_scale', 'transformer.layers.22.attention.dense.per_channel_scale', 'transformer.layers.6.mlp.proj.per_channel_scale', 'transformer.layers.11.attention.qkv.per_channel_scale', 'transformer.layers.19.attention.dense.per_channel_scale', 'transformer.layers.26.mlp.fc.per_channel_scale', 'transformer.layers.21.mlp.proj.per_channel_scale', 'transformer.layers.13.mlp.gate.per_channel_scale', 'transformer.layers.13.mlp.fc.per_channel_scale', 'transformer.layers.8.mlp.gate.per_channel_scale'}
Exception ignored in: <function PretrainedModel.__del__ at 0x7f4fbcf9dea0>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/models/modeling_utils.py", line 377, in __del__
self.release()
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/models/modeling_utils.py", line 374, in release
release_gc()
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/_utils.py", line 443, in release_gc
torch.cuda.ipc_collect()
File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 813, in ipc_collect
_lazy_init()
File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 321, in _lazy_init
raise DeferredCudaCallError(msg) from e
torch.cuda.DeferredCudaCallError: CUDA call failed lazily at initialization with error: 'NoneType' object is not iterable
CUDA call was originally invoked at:
File "/workspace6/gukaidong/workspace/trt_llm_0519/TensorRT-LLM/examples/llama/convert_checkpoint.py", line 9, in
import tensorrt_llm
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/init.py", line 32, in
import tensorrt_llm.functional as functional
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/functional.py", line 28, in
from . import graph_rewriting as gw
File "", line 1078, in _handle_fromlist
File "", line 241, in _call_with_frames_removed
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/graph_rewriting.py", line 12, in
from .network import Network
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/network.py", line 26, in
from tensorrt_llm.module import Module
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/module.py", line 17, in
from ._common import default_net
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/_common.py", line 26, in
import torch
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/torch/init.py", line 1427, in
_C._initExtension(manager_path())
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "/usr/local/lib/python3.10/dist-packages/torch/cuda/init.py", line 1303, in
_lazy_call(_register_triton_kernels)
File "/usr/local/lib/python3.10/dist-packages/torch/cuda/init.py", line 244, in _lazy_call
_queued_calls.append((callable, traceback.format_stack()))
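To narrow down whether convert_checkpoint.py wrote any quantization scales at all, here is a minimal sketch that lists the per_channel_scale tensors in the converted checkpoint (it assumes the usual single-rank rank0.safetensors layout; the path is a placeholder for --output_dir):

```python
# Minimal sketch: list which per_channel_scale tensors the conversion
# step actually wrote. Assumes the single-rank checkpoint layout
# (rank0.safetensors); the path is a placeholder for --output_dir.
from safetensors import safe_open

with safe_open("output_dir/rank0.safetensors", framework="pt") as f:
    keys = list(f.keys())

scales = [k for k in keys if k.endswith("per_channel_scale")]
print(f"{len(keys)} tensors total, {len(scales)} per_channel_scale tensors")
# If this prints 0 per_channel_scale tensors while config.json still
# declares W8A16, that mismatch is consistent with the RuntimeError above.
```

Note that the DeferredCudaCallError raised from __del__ looks like a secondary failure during interpreter teardown after the RuntimeError above, rather than the root cause.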
Additional notes
None