ChatGLM-webui
Implemented saving of the already-quantized model, which greatly speeds up startup; hoping this can be merged
I only did the 4-bit case; 8-bit works the same way. In model.py, replace the corresponding functions with the following. On the first run, set `first_run` to 1. A switch could also be added to the config, with automatic detection of whether a saved model already exists (a sketch of that follows the code).
```python
def prepare_model():
    import pickle
    from transformers import AutoModel
    global model
    if cmd_opts.precision == "int4":
        # Set first_run = 1 on the first launch to quantize and cache the
        # model; set it back to 0 afterwards to load the cached copy.
        first_run = 0
        if first_run:
            model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
            model = model.half().quantize(4)
            print("Quantization finished")
            with open(cmd_opts.model_path + "int4", 'wb') as f:
                pickle.dump(model, f)
            print("Quantized model saved")
        else:
            # Load the previously pickled quantized model, skipping quantization.
            with open(cmd_opts.model_path + "int4", 'rb') as f:
                model = pickle.load(f)
        model = model.cuda()
        model = model.eval()
        return
    model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
    if cmd_opts.cpu:
        model = model.float()
    else:
        if cmd_opts.precision == "fp16":
            model = model.half().cuda()
        elif cmd_opts.precision == "int8":
            model = model.half().quantize(8).cuda()
    model = model.eval()


def load_model():
    if cmd_opts.ui_dev:
        return
    global tokenizer, model
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
    prepare_model()
```
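The manual `first_run` toggle could be replaced by the automatic detection mentioned above, simply checking whether the cached file already exists. A minimal sketch of just the int4 branch, reusing the same `int4` path suffix as the patch:

```python
import os
import pickle

from transformers import AutoModel

def prepare_model():
    global model
    if cmd_opts.precision == "int4":
        cache_path = cmd_opts.model_path + "int4"
        if os.path.exists(cache_path):
            # Cached quantized model found: load it and skip quantization.
            with open(cache_path, "rb") as f:
                model = pickle.load(f)
        else:
            # No cache yet: quantize once and save it for future startups.
            model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
            model = model.half().quantize(4)
            with open(cache_path, "wb") as f:
                pickle.dump(model, f)
        model = model.cuda().eval()
        return
    # ... remaining precisions unchanged ...
```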
Sorry for not submitting this as a commit; some of my local changes aren't suitable for committing.
A dedicated script for saving the quantized model could be split out, and the main script could then gain an option to use the pre-quantized model, along the lines of the sketch below.
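For example, the standalone save step might look like this (a hypothetical `save_quantized.py`; the argument names and output suffix are assumptions, and it mirrors the pickle approach from the patch):

```python
# save_quantized.py -- one-off script: quantize the model and cache it to disk
import argparse
import pickle

from transformers import AutoModel

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", required=True, help="path or name of the base model")
parser.add_argument("--bits", type=int, default=4, choices=[4, 8])
args = parser.parse_args()

model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
model = model.half().quantize(args.bits)

out_path = args.model_path + f"int{args.bits}"
with open(out_path, "wb") as f:
    pickle.dump(model, f)
print(f"Quantized model saved to {out_path}")
```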
Nice!
Saving with pickle has some issues; I'll think about how best to handle it later.
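For context, pickle ties the snapshot to the exact class definitions and import paths present at save time, and unpickling runs arbitrary code, which makes it fragile for sharing. A possible alternative is the standard `save_pretrained`/`from_pretrained` round trip; this is a sketch only, assuming the model's remote code can rebuild the quantized layers from the saved checkpoint (which is how the official int4 release is distributed), and the output directory name is made up:

```python
from transformers import AutoModel, AutoTokenizer

# One-off: quantize, then save in the standard Hugging Face layout
model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
model = model.half().quantize(4)
model.save_pretrained("./chatglm-6b-int4-local")  # hypothetical output directory

tokenizer = AutoTokenizer.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
tokenizer.save_pretrained("./chatglm-6b-int4-local")

# Later runs: load the cached quantized checkpoint directly
model = AutoModel.from_pretrained("./chatglm-6b-int4-local", trust_remote_code=True)
```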
@Akegarasu The official 4-bit model is already available.
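Loading the official pre-quantized checkpoint skips local quantization entirely, per the THUDM/chatglm-6b-int4 model card:

```python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).half().cuda()
model = model.eval()
```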