[bug] Offloading to host memory leads to error
I'm testing the KV cache reuse feature. Everything works fine until I try to use offloading to host memory.
I enable offloading with these lines:
optionalParams.kvCacheConfig.hostCacheSize = 40000000000; // ~40 GB of host memory for offloaded KV blocks
optionalParams.kvCacheConfig.onboardBlocks = true;        // copy offloaded blocks back to GPU before reuse
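For completeness, block reuse itself is enabled the same way (a minimal sketch from memory; the header path and the enableBlockReuse field name are as I understand the April 30 API and may differ between versions):

#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"

tensorrt_llm::batch_manager::TrtGptModelOptionalParams optionalParams;
optionalParams.kvCacheConfig.enableBlockReuse = true; // KV reuse alone works fine without offloading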
But when I try to use the server, I encounter the following error:
[TensorRT-LLM][ERROR] Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] Assertion failed: mFreePrimaryBlocks list has no GPU blocks (/home/jenkins/agent/workspace/LLM/main/L0_MergeRequest/llm/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp:340)
1 0x412b9e tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 71
2 0x7fc0e2c2fae6 tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::findBestGPUBlockToFree() + 582
3 0x7fc0e2c320e7 tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::getFreeBlock() + 23
4 0x7fc0e2c33b4c tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::loadOrAllocateBlocks(std::__cxx11::list<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const&, tensorrt_llm::batch_manager::kv_cache_manager::GenerationRequest&, int, int) + 572
5 0x7fc0e2c34107 tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::addSequence(tensorrt_llm::batch_manager::kv_cache_manager::GenerationRequest&, int, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> const&) + 295
6 0x7fc0e2c34603 tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager::addSequence(int, int, int, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> const&) + 707
7 0x7fc0e2c51c7e tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::ssm_state_manager::SsmStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 3774
8 0x7fc0e2c536dd tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::ssm_state_manager::SsmStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 157
9 0x7fc0e2c5ea82 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::setupContext(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int) + 146
10 0x7fc0e2c5ed21 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 577
11 0x7fc0e2c68851 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2177
12 0x7fc0e2c1ae34 tensorrt_llm::batch_manager::GptManager::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&, std::unordered_set<unsigned long, std::hash<unsigned long>, std::equal_to<unsigned long>, std::allocator<unsigned long> >&) + 36
13 0x7fc0e2c21bb7 tensorrt_llm::batch_manager::GptManager::decoupled_execution_loop() + 215
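My reading of the trace (a guess on my side, not the actual implementation): with hostCacheSize set, adding a new sequence goes loadOrAllocateBlocks -> getFreeBlock -> findBestGPUBlockToFree, and the assertion fires because the free primary block list no longer contains any GPU block that could be freed or offloaded. A toy model of the invariant that seems to fail, with illustrative names only (everything except the assertion message is mine, not the real kvCacheManager internals):

#include <cstdio>
#include <list>
#include <stdexcept>

// Toy model: primary = GPU block, secondary = offloaded host block.
struct Block { bool isPrimary; };

Block* findBestGpuBlockToFree(std::list<Block>& freePrimaryBlocks) {
    for (auto& block : freePrimaryBlocks) {
        if (block.isPrimary) { return &block; }  // a GPU block we could evict/offload
    }
    // Mirrors the failing assertion: only host blocks (or nothing) are left.
    throw std::runtime_error("mFreePrimaryBlocks list has no GPU blocks");
}

int main() {
    std::list<Block> freeList{{false}, {false}};  // only offloaded (host) blocks remain
    try {
        findBestGpuBlockToFree(freeList);
    } catch (std::exception const& e) {
        std::printf("%s\n", e.what());  // same message as the server error
    }
}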
I'm using the build from April 30 and the GptManager backend.