ChatGLM-webui
Implemented saving of the already-quantized model, which greatly speeds up startup; hoping this can be merged
I only did the 4-bit case; 8-bit works the same way. In model.py, replace the corresponding functions with the following. On the first run, set `first_run` to 1. A switch could also be added to the config, with automatic detection of whether a saved model already exists (a sketch of that follows the code).
```python
def prepare_model():
    import pickle
    from transformers import AutoModel
    global model
    if cmd_opts.precision == "int4":
        # Set first_run = 1 on the first launch to quantize and cache the
        # model; set it back to 0 afterwards to load the cached copy.
        first_run = 0
        if first_run:
            model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
            model = model.half().quantize(4)
            print("Quantization finished")
            with open(cmd_opts.model_path + "int4", 'wb') as f:
                pickle.dump(model, f)
            print("Quantized model saved")
        else:
            # Load the previously pickled quantized model, skipping quantization.
            with open(cmd_opts.model_path + "int4", 'rb') as f:
                model = pickle.load(f)
        model = model.cuda()
        model = model.eval()
        return
    model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
    if cmd_opts.cpu:
        model = model.float()
    else:
        if cmd_opts.precision == "fp16":
            model = model.half().cuda()
        elif cmd_opts.precision == "int8":
            model = model.half().quantize(8).cuda()
    model = model.eval()


def load_model():
    if cmd_opts.ui_dev:
        return
    global tokenizer, model
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
    prepare_model()
```
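The manual `first_run` toggle could be replaced by the automatic detection mentioned above, simply checking whether the cached file already exists. A minimal sketch of just the int4 branch, reusing the same `int4` path suffix as the patch:

```python
import os
import pickle

from transformers import AutoModel

def prepare_model():
    global model
    if cmd_opts.precision == "int4":
        cache_path = cmd_opts.model_path + "int4"
        if os.path.exists(cache_path):
            # Cached quantized model found: load it and skip quantization.
            with open(cache_path, "rb") as f:
                model = pickle.load(f)
        else:
            # No cache yet: quantize once and save it for future startups.
            model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
            model = model.half().quantize(4)
            with open(cache_path, "wb") as f:
                pickle.dump(model, f)
        model = model.cuda().eval()
        return
    # ... remaining precisions unchanged ...
```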
Sorry for not submitting this as a commit; some of my local changes aren't suitable for committing.
A dedicated script for saving the quantized model could be split out, and the main script could then gain an option to use the pre-quantized model, along the lines of the sketch below.
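For example, the standalone save step might look like this (a hypothetical `save_quantized.py`; the argument names and output suffix are assumptions, and it mirrors the pickle approach from the patch):

```python
# save_quantized.py -- one-off script: quantize the model and cache it to disk
import argparse
import pickle

from transformers import AutoModel

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", required=True, help="path or name of the base model")
parser.add_argument("--bits", type=int, default=4, choices=[4, 8])
args = parser.parse_args()

model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
model = model.half().quantize(args.bits)

out_path = args.model_path + f"int{args.bits}"
with open(out_path, "wb") as f:
    pickle.dump(model, f)
print(f"Quantized model saved to {out_path}")
```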
Nice!
Saving with pickle has some issues; I'll think about how best to handle it later.
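For context, pickle ties the snapshot to the exact class definitions and import paths present at save time, and unpickling runs arbitrary code, which makes it fragile for sharing. A possible alternative is the standard `save_pretrained`/`from_pretrained` round trip; this is a sketch only, assuming the model's remote code can rebuild the quantized layers from the saved checkpoint (which is how the official int4 release is distributed), and the output directory name is made up:

```python
from transformers import AutoModel, AutoTokenizer

# One-off: quantize, then save in the standard Hugging Face layout
model = AutoModel.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
model = model.half().quantize(4)
model.save_pretrained("./chatglm-6b-int4-local")  # hypothetical output directory

tokenizer = AutoTokenizer.from_pretrained(cmd_opts.model_path, trust_remote_code=True)
tokenizer.save_pretrained("./chatglm-6b-int4-local")

# Later runs: load the cached quantized checkpoint directly
model = AutoModel.from_pretrained("./chatglm-6b-int4-local", trust_remote_code=True)
```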
@Akegarasu The official 4-bit model is already available.
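Loading the official pre-quantized checkpoint skips local quantization entirely, per the THUDM/chatglm-6b-int4 model card:

```python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).half().cuda()
model = model.eval()
```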