Baichuan-7B
Here is an easy-to-run demo script (cli_demo.py) that is friendlier to multi-GPU setups. Feel free to copy it and give it a try.
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Features:
# 1. Automatically supports both CPU and GPU modes
# 2. On GPU, loads the model in half precision, halving GPU memory usage
# 3. On GPU, automatically distributes the model across multiple cards
# 4. Chat context (conversation history) is not supported yet
# 5. Streaming (typewriter) output is not supported yet, so very long answers look stuck;
#    adjust MAX_TOKENS as a temporary workaround
# Author: [email protected], hobbyist developer / Yang; feel free to email me with questions

def auto_configure_device_map(num_gpus: int):
    # Spread the 32 transformer layers evenly across the GPUs;
    # embeddings go on card 0, the final norm and lm_head on the last card.
    num_trans_layers = 32
    per_gpu_layers = num_trans_layers / num_gpus
    device_map = {'model.embed_tokens': 0,
                  'model.norm': num_gpus - 1, 'lm_head': num_gpus - 1}
    for i in range(num_trans_layers):
        device_map[f'model.layers.{i}'] = int(i // per_gpu_layers)
    return device_map

# MODEL_NAME = "../baichuan-7B-model"
MODEL_NAME = "baichuan-inc/baichuan-7B"

NUM_GPUS = torch.cuda.device_count() if torch.cuda.is_available() else 0
MAX_TOKENS = 512
device_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS > 0 else None
device = torch.device("cuda") if NUM_GPUS > 0 else torch.device("cpu")
device_dtype = torch.half if NUM_GPUS > 0 else torch.float

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True,
                                             device_map=device_map, torch_dtype=device_dtype)
model = model.eval()

os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
hello_string = "欢迎使用 BaiChuan-7B 模型,输入内容即可进行对话,clear 清空对话历史,stop/exit/quit 终止程序"

def build_prompt(history):
    # Render the whole conversation so far as a single transcript.
    prompt = hello_string
    for query, response in history:
        prompt += f"\n\n用户: {query}"
        prompt += f"\n回复: {response}"
    return prompt

history = []
print(hello_string)
while True:
    query = input("\n用户: ")
    if query.strip() in ["stop", "stop()", "exit", "exit()", "quit", "quit()", "q", "q()"]:
        break
    if query.strip() in ["clear", "clear()", "cls", "cls()"]:
        history = []
        os.system(clear_command)
        print(hello_string)
        continue
    inputs = tokenizer(query, return_tensors='pt')
    inputs.input_ids = inputs.input_ids.to(device)
    inputs.attention_mask = inputs.attention_mask.to(device)
    pred = model.generate(inputs=inputs.input_ids, attention_mask=inputs.attention_mask,
                          max_new_tokens=MAX_TOKENS, repetition_penalty=1.1)
    response = tokenizer.decode(pred.cpu().tolist()[0])
    # Keep only the text generated after the echoed prompt.
    response = response[len(query) + response.find(query):]
    if response[-4:] == "</s>":
        response = response[:-4]
    history += [(query, response)]
    print(f"\n回复: {response}")
    os.system(clear_command)
    print(build_prompt(history), flush=True)
The GPU path loads the model in half precision: two 11 GB 2080 Ti cards run it smoothly, and two 10 GB cards also work, but two 8 GB cards are not enough. System RAM, however, seems to spike to 40+ GB at load time (roughly 7 GB x 4 + 7 GB x 2); if you have plenty of GPU memory but little system RAM, you can drop the half-precision mode.
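One possible way to tame the load-time RAM spike (not tested in this thread, but a standard transformers option that requires accelerate) is to pass low_cpu_mem_usage=True, which initializes the model with empty weights and then loads the checkpoint shards directly instead of materializing an extra full copy in RAM:

# Hedged variant of the load call above; low_cpu_mem_usage=True is a standard
# transformers flag (needs accelerate installed) that reduces peak system RAM during loading.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map=device_map,
    torch_dtype=device_dtype,
    low_cpu_mem_usage=True,
)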
Error:
File "/home/hope/work/baichuan-7B/try2.py", line 15
    num_trans_layers = 32
    ^
IndentationError: expected an indented block after function definition on line 14
You need to indent that part yourself; GitHub issues aren't well suited to pasted code, so the indentation gets lost. Restore the indentation and the error goes away.
Thanks!
Tidied up the formatting a bit:
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Features:
# 1. Automatically supports both CPU and GPU modes
# 2. On GPU, loads the model in half precision, halving GPU memory usage
# 3. On GPU, automatically distributes the model across multiple cards
# 4. Chat context (conversation history) is not supported yet
# 5. Streaming (typewriter) output is not supported yet, so very long answers look stuck;
#    adjust MAX_TOKENS as a temporary workaround
# Author: [email protected], hobbyist developer / Yang; feel free to email me with questions
def auto_configure_device_map(num_gpus: int):
    # Spread the 32 transformer layers evenly across the GPUs;
    # embeddings go on card 0, the final norm and lm_head on the last card.
    num_trans_layers = 32
    per_gpu_layers = num_trans_layers / num_gpus
    device_map = {'model.embed_tokens': 0,
                  'model.norm': num_gpus - 1, 'lm_head': num_gpus - 1}
    for i in range(num_trans_layers):
        device_map[f'model.layers.{i}'] = int(i // per_gpu_layers)
    return device_map

def build_prompt(history):
    # Render the whole conversation so far as a single transcript.
    prompt = hello_string
    for query, response in history:
        prompt += f"\n\n用户: {query}"
        prompt += f"\n回复: {response}"
    return prompt
#MODEL_NAME = "../baichuan-7B-model"
MODEL_NAME = "baichuan-inc/baichuan-7B"
NUM_GPUS = torch.cuda.device_count() if torch.cuda.is_available() else 0
MAX_TOKENS = 512
device_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS>0 else None
device = torch.device("cuda") if NUM_GPUS>0 else torch.device("cpu")
device_dtype = torch.half if NUM_GPUS>0 else torch.float
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, device_map=device_map, torch_dtype=device_dtype)
model = model.eval()
os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
hello_string = "欢迎使用 BaiChuan-7B 模型,输入内容即可进行对话,clear 清空对话历史,stop/exit/quit 终止程序"
history = []
print(hello_string)
while True:
    query = input("\n用户: ")
    if query.strip() in ["stop", "stop()", "exit", "exit()", "quit", "quit()", "q", "q()"]:
        break
    if query.strip() in ["clear", "clear()", "cls", "cls()"]:
        history = []
        os.system(clear_command)
        print(hello_string)
        continue
    inputs = tokenizer(query, return_tensors='pt')
    inputs.input_ids = inputs.input_ids.to(device)
    inputs.attention_mask = inputs.attention_mask.to(device)
    pred = model.generate(inputs=inputs.input_ids, attention_mask=inputs.attention_mask,
                          max_new_tokens=MAX_TOKENS, repetition_penalty=1.1)
    response = tokenizer.decode(pred.cpu().tolist()[0])
    response = response[len(query) + response.find(query):]
    if response[-4:] == "</s>":
        response = response[:-4]
    history += [(query, response)]
    print(f"\n回复: {response}")
    os.system(clear_command)
    print(build_prompt(history), flush=True)
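For reference, a quick check of what auto_configure_device_map produces for two GPUs, assuming the 32 transformer layers of the 7B model: layers 0-15 land on GPU 0 and layers 16-31 on GPU 1, with the embeddings on GPU 0 and the final norm plus lm_head on GPU 1.

# Illustration only: the layer-to-GPU assignment for num_gpus=2.
dm = auto_configure_device_map(2)
print(dm['model.embed_tokens'])         # 0
print(dm['model.layers.15'])            # 0  (layers 0-15 on GPU 0)
print(dm['model.layers.16'])            # 1  (layers 16-31 on GPU 1)
print(dm['model.norm'], dm['lm_head'])  # 1 1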
Thanks a lot, I'll test it shortly.
The code above ran on an A10 on the first try. Roughly the commands used:
conda create -n ai python=3.10
conda activate ai
git clone https://github.com/baichuan-inc/baichuan-7B.git
cd baichuan-7B/
pip install -r requirements.txt
pip install accelerate
python ./cli_demo.py
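Optionally, before launching the demo, a quick check that the CUDA build of PyTorch actually sees all the cards:

python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"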
@lilongthinker @lanny2018
Why do I get the following error when using multiple GPUs?
A single GPU works fine.
The code is as follows: