int4量化模型,运行一段时间后报错 (int4-quantized model raises an error after running for a while)
OS: Ubuntu; GPU: V100
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/opt/conda/envs/py38/lib/python3.8/site-packages/uvicorn/protocols/websockets/websockets_impl.py", line 254, in run_asgi
result = await self.app(self.scope, self.asgi_receive, self.asgi_send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/uvicorn/middleware/proxy_headers.py", line 78, in call
return await self.app(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/applications.py", line 276, in call
await super().call(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/applications.py", line 122, in call
await self.middleware_stack(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/middleware/errors.py", line 149, in call
await self.app(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/middleware/exceptions.py", line 79, in call
raise exc
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/middleware/exceptions.py", line 68, in call
await self.app(scope, receive, sender)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/middleware/asyncexitstack.py", line 21, in call
raise e
File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in call
await self.app(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/routing.py", line 718, in call
await route.handle(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/routing.py", line 341, in handle
await self.app(scope, receive, send)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/routing.py", line 82, in app
await func(session)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/routing.py", line 289, in app
await dependant.call(**values)
File "/app/api.py", line 51, in stream_chat
for response in infer.stream_forward(messages, paras):
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 43, in generator_context
response = gen.send(None)
File "/app/moss_inference_extend.py", line 18, in stream_forward
for output_ids in self.stream_sample(
File "/app/moss_inference_extend.py", line 62, in stream_sample
logits, past_key_values = self.infer_(input_ids if i == 0 else new_generated_id, attention_mask, past_key_values)
File "/app/moss_inference.py", line 338, in infer_
outputs: BaseModelOutputWithPast = self.model(**inputs)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/app/models/modeling_moss.py", line 678, in forward
transformer_outputs = self.transformer(
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/app/models/modeling_moss.py", line 545, in forward
outputs = block(
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/app/models/modeling_moss.py", line 270, in forward
attn_outputs = self.attn(
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/app/models/modeling_moss.py", line 164, in forward
qkv = self.qkv_proj(hidden_states)
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/app/models/quantization.py", line 367, in forward
out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 105, in decorate_fwd
return fwd(*args, **kwargs)
File "/app/models/quantization.py", line 279, in forward
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
File "/app/models/quantization.py", line 250, in matmul248
matmul_248_kernel[grid](input, qweight, output,
File "/app/models/custom_autotune.py", line 109, in run
return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
File "