
Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes

BBC-Esq opened this issue 1 year ago · 1 comment

Here are my initial test results comparing ct2 (using int8) with the bitsandbytes library at 4-bit and 8-bit... nicely done, CTranslate2 people. Looking forward to testing GGUF as well.

[image: benchmark chart]
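
For context, the bitsandbytes side of the comparison runs through transformers' quantization config rather than CTranslate2. A minimal sketch of what that setup can look like (the model id, prompt, and generation settings below are placeholders, not the exact test code):

    import time
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    # Placeholder model id -- swap in whichever checkpoint is being benchmarked.
    model_id = "mistralai/Mistral-7B-Instruct-v0.2"

    # 4-bit config; for the 8-bit runs, use BitsAndBytesConfig(load_in_8bit=True) instead.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )

    inputs = tokenizer("[INST] your prompt here [/INST]", return_tensors="pt").to(model.device)

    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.1, top_k=20)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    print(f"Response time: {time.time() - start:.2f} seconds")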

Here's the relevant portion of my script, omitting the prompt, the imports, and the private path to the models on my computer:

PARTIAL TEST SCRIPT
    context_length = 4095
    max_generation_length = 512
    max_prompt_length = context_length - max_generation_length

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    warnings.filterwarnings("ignore", module="pynvml")

    results = {}

    for model_config in models:
        model_dir = model_config["model_dir"]
        build_prompt_func = model_config["build_prompt"]

        model_name = os.path.basename(model_dir)
        print(f"\033[32mLoading the model: {model_name}...\033[0m")
        intra_threads = max(os.cpu_count() - 4, 1)  # leave a few cores free, but always use at least one thread
        generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
        sp = spm.SentencePieceProcessor(os.path.join(model_dir, "tokenizer.model"))

        model_results = []

        for _ in range(3):
            start_time = time.time()
            dialog = [{"role": "user", "content": user_prompt}]
            prompt_tokens = build_prompt_func(sp, dialog)
            step_results = generator.generate_tokens(
                prompt_tokens,
                max_length=max_generation_length,
                sampling_temperature=0.1,
                sampling_topk=20,
                sampling_topp=1,
            )

            memory_info_peak = pynvml.nvmlDeviceGetMemoryInfo(handle)
            vram_usage_peak = memory_info_peak.used / 1024**2

            print("", flush=True)
            text_output = ""
            num_generated_tokens = 0
            for word in generate_words(sp, step_results):
                if text_output:
                    word = " " + word
                print(word, end="", flush=True)
                text_output += word
                num_generated_tokens += 1
            print("")

            end_time = time.time()
            response_time = end_time - start_time

            model_results.append({
                "response_time": response_time,
                "peak_vram_usage": vram_usage_peak
            })

        results[model_name] = model_results

        del generator
        del sp
        gc.collect()

        time.sleep(2)

    pynvml.nvmlShutdown()

    print("\nAverage Results:")
    for model_name, model_results in results.items():
        avg_response_time = sum(result['response_time'] for result in model_results) / len(model_results)
        avg_peak_vram_usage = sum(result['peak_vram_usage'] for result in model_results) / len(model_results)
        print(f"Model: {model_name}")
        print(f"Average Response Time: {avg_response_time:.2f} seconds")
        print(f"Average Peak VRAM Usage: {avg_peak_vram_usage:.2f} MB")
        print()

def generate_words(sp, step_results):
    tokens_buffer = []
    for step_result in step_results:
        is_new_word = step_result.token.startswith("▁")  # "▁" is the SentencePiece word-boundary marker
        if is_new_word and tokens_buffer:
            word = sp.decode(tokens_buffer)
            if word:
                yield word
            tokens_buffer = []
        tokens_buffer.append(step_result.token_id)
    if tokens_buffer:
        word = sp.decode(tokens_buffer)
        if word:
            yield word

def build_prompt_solar_10_7b_instruct_v1_0(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"""### System:\n{system_message}\n\n### User:\n{user_prompt}\n\n### Assistant:\n"""
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_neural_chat_7b_v3_3(sp, dialog):
    system_prompt = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    user_prompt = dialog[0]["content"]
    prompt = f"### System:\n{system_prompt}\n### User:\n{user_prompt}\n### Assistant: "
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_llama_2_7b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_llama_2_13b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
    
def build_prompt_mistral_7b_instruct_v0_2(sp, dialog):
    user_prompt = dialog[0]["content"]
    prompt = f"<s>[INST] {user_prompt} [/INST]</s>\n"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
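
For completeness, the snippet above expects a models list that pairs each converted model directory with its prompt builder; each directory is a CTranslate2 int8 conversion (e.g. produced with ct2-transformers-converter --quantization int8) with the SentencePiece tokenizer.model copied alongside it. A hypothetical example of that list (the paths are placeholders for the private local directories omitted above):

    # Hypothetical configuration -- the paths stand in for the omitted private model directories.
    models = [
        {"model_dir": "/path/to/Mistral-7B-Instruct-v0.2-ct2-int8",
         "build_prompt": build_prompt_mistral_7b_instruct_v0_2},
        {"model_dir": "/path/to/Llama-2-7b-chat-ct2-int8",
         "build_prompt": build_prompt_llama_2_7b_chat},
    ]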

Let me know if anyone wants the full test script. EDIT: I ran it 5 times and changed the graph to only show the models that were run on all backends; I think it's a better chart.

BBC-Esq · Mar 27 '24

Updated graph here including llama.cpp, which, apparently, is faster but uses slightly more VRAM, except for the 13B model where it's 3 GB higher. Also, the numbers changed somewhat because I ran each model 15 times instead of 3.

[image: updated benchmark chart including llama.cpp]
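
For anyone wanting to reproduce the llama.cpp side, a rough equivalent through llama-cpp-python would look something like the sketch below (the GGUF path, context size, and offload settings are placeholders; the actual test script may differ):

    import time
    from llama_cpp import Llama

    # Placeholder GGUF path -- swap in the quantized model being benchmarked.
    llm = Llama(
        model_path="/path/to/model.Q8_0.gguf",
        n_ctx=4096,
        n_gpu_layers=-1,  # offload all layers to the GPU
    )

    start = time.time()
    output = llm(
        "[INST] your prompt here [/INST]",
        max_tokens=512,
        temperature=0.1,
        top_k=20,
    )
    print(output["choices"][0]["text"])
    print(f"Response time: {time.time() - start:.2f} seconds")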

Ask if you're interested in the test scripts.

BBC-Esq · Mar 28 '24