FastChat
FastChat copied to clipboard
How to enable batch evaluation? I got RuntimeError: CUDA error: device-side assert triggered
To accelerate evaluation, I want to generate with multiple prompts rather than only one prompt. But I got the following CUDA error. Can someone help with this?
error:
131 ../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [27,0,0], thread: [27,0,0] Assertion srcIndex < srcSelectDimSize failed.
132 ../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [27,0,0], thread: [28,0,0] Assertion srcIndex < srcSelectDimSize failed.
133 ../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [27,0,0], thread: [29,0,0] Assertion srcIndex < srcSelectDimSize failed.
134 ../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [27,0,0], thread: [30,0,0] Assertion srcIndex < srcSelectDimSize failed.
135 ../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [27,0,0], thread: [31,0,0] Assertion srcIndex < srcSelectDimSize failed.
179 File "/workdir/machi04/git/vicuna/fastchat/serve/api1.py", line 41, in create_item
180 output_ids = model.generate(
181 File "/usr/local/conda/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
182 return func(*args, **kwargs)
183 File "/home/hadoop-hmart-waimai-rank/.local/lib/python3.9/site-packages/transformers/generation/utils.py", line 1485, in generate
184 return self.sample(
185 File "/home/hadoop-hmart-waimai-rank/.local/lib/python3.9/site-packages/transformers/generation/utils.py", line 2524, in sample
186 outputs = self(
187 File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
188 return forward_call(*args, **kwargs)
189 File "/home/hadoop-hmart-waimai-rank/.local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 687, in forward
190 outputs = self.model(
191 File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
192 return forward_call(*args, **kwargs)
193 File "/home/hadoop-hmart-waimai-rank/.local/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 530, in forward
194 inputs_embeds = self.embed_tokens(input_ids)
195 File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
196 return forward_call(*args, **kwargs)
197 File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
198 return F.embedding(
199 File "/usr/local/conda/lib/python3.9/site-packages/torch/nn/functional.py", line 2210, in embedding
200 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
201 RuntimeError: CUDA error: device-side assert triggered
202 Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
# Build one prompt per input message using the model's conversation template.
prompts = []
for msg in msgs:
    conv = get_conversation_template(model_path)
    conv.append_message(conv.roles[0], msg)
    conv.append_message(conv.roles[1], None)
    prompts.append(conv.get_prompt())

# LLaMA tokenizers ship without a pad token.  With padding=True the tokenizer
# then fills rows with an out-of-vocabulary id, and the embedding lookup
# raises the "srcIndex < srcSelectDimSize" device-side assert seen above.
# Reuse EOS as the pad token so every padded id is a valid row of the
# embedding matrix.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models must be LEFT-padded for batched generation so the real
# tokens of every prompt end at the same position.
tokenizer.padding_side = "left"

encoded = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
input_ids = encoded["input_ids"]
print('input_ids:', input_ids)

# Pass the attention mask so generation ignores the pad positions, and tell
# generate() which id it may use for padding its own output.
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=encoded["attention_mask"],
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.0,
    max_new_tokens=512,
    pad_token_id=tokenizer.pad_token_id,
)

if not model.config.is_encoder_decoder:
    # Decoder-only outputs echo the prompt; with left padding every row has
    # the same (padded) prompt width, so a single tensor slice strips it.
    output_ids = output_ids[:, input_ids.shape[1]:]

outputs = tokenizer.batch_decode(
    output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
)