
Error when running inference on relatively long texts

Open leizhu1989 opened this issue 1 year ago • 4 comments

With the chatglm4-9b-1m model, when running inference over a batch of files read from disk, the following error occurs at random:

GGML_ASSERT: /home/lili/chatglm.cpp/third_party/ggml/src/ggml.c:3596: view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src) Aborted (core dumped)
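
For reference, a minimal sketch of the kind of batch loop that hits this. The file paths and sampling parameters below are placeholders, not the exact script; it only uses the standard chatglm_cpp Python bindings (Pipeline, ChatMessage, chat) on one long document per iteration:

# Illustrative repro sketch only; file paths and sampling parameters are placeholders.
import glob

import chatglm_cpp

pipeline = chatglm_cpp.Pipeline("/path/to/chatglm4-9b-1m-ggml.bin", max_length=4096)

for path in glob.glob("docs/*.txt"):  # batch of long text files
    with open(path, encoding="utf-8") as f:
        text = f.read()
    messages = [chatglm_cpp.ChatMessage(role="user", content=text)]
    # The GGML_ASSERT above fires at random somewhere inside this loop.
    output = pipeline.chat(messages, do_sample=True, temperature=0.95, top_p=0.7)
    print(path, output.content[:80])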

leizhu1989 · Jul 30 '24 06:07

It might be caused by running out of GPU memory.
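
One way to check that hypothesis (a rough monitoring sketch, assuming an NVIDIA GPU with nvidia-smi on PATH; it is separate from the repro itself): poll used GPU memory while the batch runs and see whether it is actually near the limit when the assert fires.

# Rough GPU-memory monitoring sketch; run it alongside the batch job.
import subprocess
import time

while True:
    usage = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader"],
        capture_output=True, text=True,
    ).stdout.strip()
    print(time.strftime("%H:%M:%S"), usage)  # e.g. "12345 MiB, 24576 MiB"
    time.sleep(5)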

leizhu1989 · Jul 30 '24 07:07

No, that's not it: the failure is random, so it looks like a real bug.

leizhu1989 · Jul 30 '24 07:07

Tail of the generated output right before another crash ("6. Emphasize variety in question types; varied exercises help consolidate ..."):

6. 注重题型多样化 多样化的练习能够帮助巩固

GGML_ASSERT: /home/lili/chatglm.cpp/third_party/ggml/src/ggml.c:3596: view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src) Aborted (core dumped)

It looks like the bug shows up after generation has finished. I ran the original openai_api code:

import asyncio
import base64
import io
import json
import logging
import time
from typing import Dict, List, Literal, Optional, Union

import chatglm_cpp
import uvicorn
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, computed_field
from pydantic_settings import BaseSettings
from sse_starlette.sse import EventSourceResponse

logging.basicConfig(level=logging.INFO, format=r"%(asctime)s - %(module)s - %(levelname)s - %(message)s")

class Settings(BaseSettings):
    model: str = "/home/lili/models/chatglm-ggml-int8-9b-1m.bin"
    max_length: int = 4096

class ToolCallFunction(BaseModel):
    arguments: str
    name: str

class ToolCall(BaseModel):
    function: Optional[ToolCallFunction] = None
    type: Literal["function"]

class ContentText(BaseModel):
    type: Literal["text"] = "text"
    text: str

class ContentImageUrlData(BaseModel):
    url: str
    detail: str = "high"

class ContentImageUrl(BaseModel):
    type: Literal["image_url"] = "image_url"
    image_url: ContentImageUrlData

class ChatMessage(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: Union[str, List[Union[ContentText, ContentImageUrl]]]
    tool_calls: Optional[List[ToolCall]] = None

class DeltaMessage(BaseModel):
    role: Optional[Literal["system", "user", "assistant"]] = None
    content: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = None

class ChatCompletionToolFunction(BaseModel):
    description: Optional[str] = None
    name: str
    parameters: Dict

class ChatCompletionTool(BaseModel):
    type: Literal["function"] = "function"
    function: ChatCompletionToolFunction

class ChatCompletionRequest(BaseModel):
    model: str = "default-model"
    messages: List[ChatMessage]
    temperature: float = Field(default=0.95, ge=0.0, le=2.0)
    top_p: float = Field(default=0.7, ge=0.0, le=1.0)
    top_k: int = Field(default=2, ge=3)
    stream: bool = False
    max_tokens: int = Field(default=2048, ge=0)
    tools: Optional[List[ChatCompletionTool]] = None
    repeat_penalty: float = Field(default=1.0, ge=0.0, le=2.0)

    model_config = {
        "json_schema_extra": {
            "examples": [{"model": "default-model", "messages": [{"role": "user", "content": "你好"}]}]
        }
    }

class ChatCompletionResponseChoice(BaseModel):
    index: int = 0
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]

class ChatCompletionResponseStreamChoice(BaseModel):
    index: int = 0
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]] = None

class ChatCompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int

    @computed_field
    @property
    def total_tokens(self) -> int:
        return self.prompt_tokens + self.completion_tokens

class ChatCompletionResponse(BaseModel):
    id: str = "chatcmpl"
    model: str = "default-model"
    object: Literal["chat.completion", "chat.completion.chunk"]
    created: int = Field(default_factory=lambda: int(time.time()))
    choices: Union[List[ChatCompletionResponseChoice], List[ChatCompletionResponseStreamChoice]]
    usage: Optional[ChatCompletionUsage] = None

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "id": "chatcmpl",
                    "model": "default-model",
                    "object": "chat.completion",
                    "created": 1691166146,
                    "choices": [
                        {
                            "index": 0,
                            "message": {
                                "role": "assistant",
                                "content": "你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。",
                            },
                            "finish_reason": "stop",
                        }
                    ],
                    "usage": {"prompt_tokens": 17, "completion_tokens": 29, "total_tokens": 46},
                }
            ]
        }
    }

settings = Settings()
app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)
pipeline = chatglm_cpp.Pipeline(settings.model, max_length=settings.max_length)
lock = asyncio.Lock()

def stream_chat(messages, body):
    yield ChatCompletionResponse(
        object="chat.completion.chunk",
        choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(role="assistant"))],
    )

    for chunk in pipeline.chat(
        messages=messages,
        max_length=4500,
        max_new_tokens=2000,
        max_context_length=2500,
        do_sample=0.95,
        top_p=body.top_p,
        top_k=body.top_k,
        temperature=body.temperature,
        repetition_penalty=body.repeat_penalty,
        stream=True,
    ):
        yield ChatCompletionResponse(
            object="chat.completion.chunk",
            choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(content=chunk.content))],
        )

    yield ChatCompletionResponse(
        object="chat.completion.chunk",
        choices=[ChatCompletionResponseStreamChoice(delta=DeltaMessage(), finish_reason="stop")],
    )

async def stream_chat_event_publisher(history, body):
    output = ""
    try:
        async with lock:
            for chunk in stream_chat(history, body):
                await asyncio.sleep(0)  # yield control back to event loop for cancellation check
                output += chunk.choices[0].delta.content or ""
                print(chunk.choices[0].delta.content)
                yield chunk.model_dump_json(exclude_unset=True)
        logging.info(f'prompt: "{history[-1]}", stream response: "{output}"')
    except asyncio.CancelledError as e:
        logging.info(f'prompt: "{history[-1]}", stream response (partial): "{output}"')
        raise e

@app.post("/v1/chat/completions")
async def create_chat_completion(body: ChatCompletionRequest) -> ChatCompletionResponse:
    def to_json_arguments(arguments):
        def tool_call(**kwargs):
            return kwargs

        try:
            return json.dumps(eval(arguments, dict(tool_call=tool_call)))
        except Exception:
            return arguments

    if not body.messages:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "empty messages")

    messages = []
    for msg in body.messages:
        if isinstance(msg.content, str):
            msg.content = msg.content[:3000]
            messages.append(chatglm_cpp.ChatMessage(role=msg.role, content=msg.content))
        # else:
        #     if not (len(msg.content) == 2 and msg.content[0].type == "text" and msg.content[1].type == "image_url"):
        #         raise HTTPException(
        #             status.HTTP_400_BAD_REQUEST,
        #             "multimodal content must have a text item followed by an image_url item",
        #         )

        #     import numpy as np
        #     from PIL import Image

        #     text = msg.content[0].text
        #     image_url = msg.content[1].image_url.url
        #     if image_url.startswith("data:"):
        #         image_bytes = base64.b64decode(image_url.split(",")[1])
        #     else:
        #         import requests

        #         image_bytes = requests.get(image_url).content
        #     image = chatglm_cpp.Image(np.asarray(Image.open(io.BytesIO(image_bytes))))

        #     messages.append(chatglm_cpp.ChatMessage(role=msg.role, content=text, image=image))

    if body.tools:
        system_content = (
            "Answer the following questions as best as you can. You have access to the following tools:\n"
            + json.dumps([tool.model_dump() for tool in body.tools], indent=4)
        )
        messages.insert(0, chatglm_cpp.ChatMessage(role="system", content=system_content))

    if body.stream:
        generator = stream_chat_event_publisher(messages, body)
        return EventSourceResponse(generator)

    max_context_length = 2500
    output = pipeline.chat(
        messages=messages,
        max_length=4500,
        max_new_tokens=2000,
        max_context_length=2500,
        do_sample=0.95,
        top_p=body.top_p,
        top_k=body.top_k,
        temperature=body.temperature,
        repetition_penalty=body.repeat_penalty,
    )
    logging.info(f'prompt: "{messages[-1].content}", sync response: "{output.content}"')
    prompt_tokens = len(pipeline.tokenizer.apply_chat_template(messages, max_context_length))
    completion_tokens = len(pipeline.tokenizer.encode(output.content, body.max_tokens))

    finish_reason = "stop"
    tool_calls = None
    if output.tool_calls:
        tool_calls = [
            ToolCall(
                type=tool_call.type,
                function=ToolCallFunction(
                    name=tool_call.function.name, arguments=to_json_arguments(tool_call.function.arguments)
                ),
            )
            for tool_call in output.tool_calls
        ]
        finish_reason = "function_call"

    return ChatCompletionResponse(
        object="chat.completion",
        choices=[
            ChatCompletionResponseChoice(
                message=ChatMessage(role="assistant", content=output.content, tool_calls=tool_calls),
                finish_reason=finish_reason,
            )
        ],
        usage=ChatCompletionUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens),
    )

class ModelCard(BaseModel):
    id: str
    object: Literal["model"] = "model"
    owned_by: str = "owner"
    permission: List = []

class ModelList(BaseModel):
    object: Literal["list"] = "list"
    data: List[ModelCard] = []

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "object": "list",
                    "data": [{"id": "gpt-3.5-turbo", "object": "model", "owned_by": "owner", "permission": []}],
                }
            ]
        }
    }

@app.get("/v1/models")
async def list_models() -> ModelList:
    return ModelList(data=[ModelCard(id="gpt-3.5-turbo")])

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
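
One small debugging addition that only reuses calls already present in this script (a sketch, not something the upstream code does): log the prompt token count right before each pipeline.chat call, to check whether the random crashes correlate with prompts close to max_context_length (2500 here).

# Debugging sketch: count prompt tokens before each request, using the same
# tokenizer call this script already uses for usage accounting.
num_prompt_tokens = len(pipeline.tokenizer.apply_chat_template(messages, 2500))
logging.info("about to chat: prompt_tokens=%d, last_msg_chars=%d",
             num_prompt_tokens, len(messages[-1].content))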

leizhu1989 · Jul 30 '24 07:07

It does look like a bug. Could you provide a minimal reproduction example?

li-plus · Jul 31 '24 04:07