GLM-4
GLM-4-Z1-Rumination-32B-0414 hangs when running the function call demo code
Running the demo code gets stuck: get_assistant() never returns any output, and this reproduces consistently. VRAM usage looks normal and the GPUs are under load, but there is simply no output.
Environment: GPU: L20 * 4, CUDA: 12.4, Python: 3.10, transformers: 4.51.3, PyTorch: 2.6.0
Demo code:
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
MODEL_PATH = "THUDM/GLM-4-Z1-Rumination-32B-0414"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto")
messages = [{"role": "user", "content": "Let a, b be positive real numbers such that ab = a + b + 3. Determine the range of possible values for a + b."}]
generate_kwargs = {
    "temperature": 0.95,
    "top_p": 0.7,
    "do_sample": True,
    "max_new_tokens": 16384
}

def get_assistant():
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
        return_dict=True,
    ).to(model.device)
    out = model.generate(input_ids=inputs["input_ids"], **generate_kwargs)
    # decode only the newly generated tokens, skipping the prompt
    return tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
def get_observation(function_name, args):
    content = None
    if function_name == "search":
        mock_search_res = [
            {"title": "t1", "url": "url1", "snippet": "snippet_content_1"},
            {"title": "t2", "url": "url2", "snippet": "snippet_content_2"}
        ]
        # join the formatted result strings with blank lines
        content = "\n\n".join(f"【{i}†{res['title']}†{res['url']}\n{res['snippet']}】" for i, res in enumerate(mock_search_res))
    elif function_name == "click":
        mock_click_res = "main content"
        content = mock_click_res
    elif function_name == "open":
        mock_open_res = "main_content"
        content = mock_open_res
    else:
        raise ValueError("unsupported function name!")
    return content
def get_func_name_args(llm_text):
    # strip the <think>...</think> reasoning block so only the JSON tool call remains
    function_call = re.sub(r'<think>.*?</think>', '', llm_text, flags=re.DOTALL)
    function_call = json.loads(function_call)
    action = function_call['name']
    params = function_call['arguments']
    return action, params
def pipeline():
    end_str = "{\"name\": \"finish\", \"arguments\": {}}"
    response = get_assistant()
    messages.append({"role": "assistant", "content": response})
    max_turns, turns = 35, 1
    while not response.endswith(end_str) and turns < max_turns:
        # execute the requested tool, feed the observation back, and continue
        action, params = get_func_name_args(response)
        messages.append({"role": "observation", "content": get_observation(action, params)})
        response = get_assistant()
        messages.append({"role": "assistant", "content": response})
        turns += 1
Try changing the model loading line to:
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda", torch_dtype=torch.bfloat16)
@zRzRzRzRzRzRzR That doesn't work. device_map="cuda" loads everything onto a single GPU, and there isn't enough VRAM, so it overflows. Have you tested this on multiple GPUs before, or does the Rumination model currently only run on a single card?
Oh, device_map="auto" works too, that part is fine. torch_dtype=torch.bfloat16 is what actually matters.
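That fits the symptom: without an explicit torch_dtype, from_pretrained loads the checkpoint in float32, which doubles the memory footprint and makes a 32B model decode extremely slowly, so it can look like a hang. A quick way to confirm the load (a minimal sketch; hf_device_map is only populated when device_map is used):

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
print(model.dtype)          # should print torch.bfloat16, not torch.float32
print(model.hf_device_map)  # module -> GPU index mapping across the cards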
I tested in a different environment and it still fails: launching this model on 4090 * 4 also gets stuck with no output.
Python package versions in this environment:
Package                  Version
------------------------ ----------
accelerate               1.6.0
certifi                  2025.1.31
charset-normalizer       3.4.1
filelock                 3.18.0
fsspec                   2025.3.2
huggingface-hub          0.30.2
idna                     3.10
Jinja2                   3.1.6
MarkupSafe               3.0.2
modelscope               1.25.0
mpmath                   1.3.0
networkx                 3.4.2
numpy                    2.2.4
nvidia-cublas-cu12       12.4.5.8
nvidia-cuda-cupti-cu12   12.4.127
nvidia-cuda-nvrtc-cu12   12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12        9.1.0.70
nvidia-cufft-cu12        11.2.1.3
nvidia-curand-cu12       10.3.5.147
nvidia-cusolver-cu12     11.6.1.9
nvidia-cusparse-cu12     12.3.1.170
nvidia-cusparselt-cu12   0.6.2
nvidia-nccl-cu12         2.21.5
nvidia-nvjitlink-cu12    12.4.127
nvidia-nvtx-cu12         12.4.127
packaging                24.2
pip                      25.0
psutil                   7.0.0
PyYAML                   6.0.2
regex                    2024.11.6
requests                 2.32.3
safetensors              0.5.3
setuptools               75.8.0
sympy                    1.13.1
tokenizers               0.21.1
torch                    2.6.0
tqdm                     4.67.1
transformers             4.51.3
triton                   3.2.0
typing_extensions        4.13.2
urllib3                  2.4.0
wheel                    0.45.1
Modified demo code:
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
import torch
MODEL_PATH = "/data/llm_models/ZhipuAI/GLM-Z1-Rumination-32B-0414"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto",torch_dtype=torch.bfloat16)
messages = [{"role": "user", "content": "Let a, b be positive real numbers such that ab = a + b + 3. Determine the range of possible values for a + b."}]
generate_kwargs = {
    "temperature": 0.95,
    "top_p": 0.7,
    "do_sample": True,
    "max_new_tokens": 16384
}

def get_assistant():
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
        return_dict=True,
    ).to(model.device)
    out = model.generate(input_ids=inputs["input_ids"], **generate_kwargs)
    # decode only the newly generated tokens, skipping the prompt
    return tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
def get_observation(function_name, args):
    content = None
    if function_name == "search":
        mock_search_res = [
            {"title": "t1", "url": "url1", "snippet": "snippet_content_1"},
            {"title": "t2", "url": "url2", "snippet": "snippet_content_2"}
        ]
        # join the formatted result strings with blank lines
        content = "\n\n".join(f"【{i}†{res['title']}†{res['url']}\n{res['snippet']}】" for i, res in enumerate(mock_search_res))
    elif function_name == "click":
        mock_click_res = "main content"
        content = mock_click_res
    elif function_name == "open":
        mock_open_res = "main_content"
        content = mock_open_res
    else:
        raise ValueError("unsupported function name!")
    return content
def get_func_name_args(llm_text):
    # strip the <think>...</think> reasoning block so only the JSON tool call remains
    function_call = re.sub(r'<think>.*?</think>', '', llm_text, flags=re.DOTALL)
    function_call = json.loads(function_call)
    action = function_call['name']
    params = function_call['arguments']
    return action, params
def pipeline():
    end_str = "{\"name\": \"finish\", \"arguments\": {}}"
    response = get_assistant()
    print("--------response----------", response)
    messages.append({"role": "assistant", "content": response})
    max_turns, turns = 35, 1
    while not response.endswith(end_str) and turns < max_turns:
        action, params = get_func_name_args(response)
        messages.append({"role": "observation", "content": get_observation(action, params)})
        response = get_assistant()
        print("--------response----------", response)
        messages.append({"role": "assistant", "content": response})
        turns += 1
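With the truncated loop restored, the script can be run end to end. A minimal entry point, plus a sanity check of get_func_name_args on a hypothetical response (this assumes the model emits its reasoning in a <think>...</think> block followed by the JSON tool call):

if __name__ == "__main__":
    # hypothetical response text, used only to sanity-check the parser
    sample = '<think>reasoning...</think>{"name": "search", "arguments": {"query": "range of a + b"}}'
    assert get_func_name_args(sample) == ("search", {"query": "range of a + b"})
    pipeline()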