Baichuan-13B 提问它是不是baichuan-13B，回答却是baichuan-7B，请问是为啥？

1 下载模型

from huggingface_hub import snapshot_download
snapshot_download(repo_id="baichuan-inc/Baichuan-13B-Chat", cache_dir=".")

2 推理模型

from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers.generation.utils import GenerationConfig
import uvicorn, json, datetime
import torch
import os
from typing import Dict, Tuple, Union, Optional
from torch.nn import Module

# 自动设置device_map
def auto_configure_device_map(num_gpus: int):
    num_trans_layers = 40
    per_gpu_layers = num_trans_layers / num_gpus
    device_map = {'model.embed_tokens': 0,
    'model.norm': num_gpus-1, 'lm_head': num_gpus-1}
    for i in range(num_trans_layers):
        device_map[f'model.layers.{i}'] = int(i//per_gpu_layers)
    return device_map

# GPU数量
NUM_GPUS = torch.cuda.device_count() if torch.cuda.is_available() else None
# device_map
device_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS>0 else None
device = torch.device("cuda") if NUM_GPUS>0 else torch.device("cpu")
device_dtype = torch.half if NUM_GPUS>0 else torch.float

# 显存回收
def torch_gc():
    if torch.cuda.is_available():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

app = FastAPI()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    history = json_post_list.get('history')
    max_length = json_post_list.get('max_length')
    top_p = json_post_list.get('top_p')
    temperature = json_post_list.get('temperature')
    messages = []
    messages.append({"role": "user", "content": prompt})
    response = model.chat(tokenizer, messages)
    
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    #response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
    answer = {
        "response": response,
        "status": 200,
        "time": time
    }
    log = "[" + time + "] " + '", prompt:"' + prompt + '", response:"' + repr(response) + '"'
    print(log)
    torch_gc()
    return answer


if __name__ == '__main__':
    model_dir = "./baichuan-inc--Baichuan-13B-Chat/snapshots/d0a98e13222c6e82d24062f60ff491519e249744"
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, device_map=device_map, torch_dtype=torch.float16)
    model.generation_config = GenerationConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
    print(model)
    model.eval()
    uvicorn.run(app, host='0.0.0.0', port=8080, workers=1)

3 访问模型

curl --location 'http://localhost:8080' \
--header 'Content-Type: application/json' \
--data '{"prompt": "你是baichuan-13b吗？", "history": []}'
{"response":"不是，我是Baichuan-7B，一个人工智能程序，可以在多个任务中提供帮助，包括但不限于回答问题、提供建议、生成代码和解释算法。","status":200,"time":"2023-07-11 17:40:08"

4 服务日志

[2023-07-11 17:40:08] ", prompt:"你是baichuan-13b吗？", response:"'不是，我是Baichuan-7B，一个人工智能程序，可以在多个任务中提供帮助，包括但不限于回答问题、提供建议、生成代码和解释算法。'"
INFO:     xx.xx.xx.xx:32134 - "POST / HTTP/1.1" 200 OK