CTranslate2
Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes
Here are my initial results comparing ct2 (using int8) with the bitsandbytes library at 4-bit and 8-bit precision. Nicely done, CTranslate2 people. Looking forward to testing GGUF as well.
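For context, the bitsandbytes side of the comparison is the usual Transformers 4-bit/8-bit load. Below is a minimal sketch of that kind of load, not my exact test code; the model id, prompt, and quantization options are placeholders, and the exact bnb settings (NF4, double quantization, etc.) will change the numbers.

# Sketch only: loading an HF model in 4-bit with bitsandbytes via transformers.
# The model id and prompt are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # placeholder
bnb_config = BitsAndBytesConfig(load_in_4bit=True)  # or load_in_8bit=True

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

inputs = tokenizer("[INST] Hello, how are you? [/INST]", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))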
And here's the relevant portion of my ctranslate2 script, omitting the prompt, imports, and the private paths to the models on my computer:
PARTIAL TEST SCRIPT
context_length = 4095
max_generation_length = 512
max_prompt_length = context_length - max_generation_length

# Track VRAM usage on the first GPU via NVML.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
warnings.filterwarnings("ignore", module="pynvml")

results = {}

for model_config in models:
    model_dir = model_config["model_dir"]
    build_prompt_func = model_config["build_prompt"]
    model_name = os.path.basename(model_dir)
    print(f"\033[32mLoading the model: {model_name}...\033[0m")
    # Reserve a few CPU cores for the rest of the system.
    intra_threads = max(os.cpu_count() - 4, 1)
    generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
    sp = spm.SentencePieceProcessor(os.path.join(model_dir, "tokenizer.model"))

    model_results = []
    for _ in range(3):
        start_time = time.time()
        dialog = [{"role": "user", "content": user_prompt}]
        prompt_tokens = build_prompt_func(sp, dialog)
        step_results = generator.generate_tokens(
            prompt_tokens,
            max_length=max_generation_length,
            sampling_temperature=0.1,
            sampling_topk=20,
            sampling_topp=1,
        )
        memory_info_peak = pynvml.nvmlDeviceGetMemoryInfo(handle)
        vram_usage_peak = memory_info_peak.used / 1024**2
        print("", flush=True)

        # Stream the generated words to the console as they arrive.
        text_output = ""
        num_generated_tokens = 0
        for word in generate_words(sp, step_results):
            if text_output:
                word = " " + word
            print(word, end="", flush=True)
            text_output += word
            num_generated_tokens += 1
        print("")

        end_time = time.time()
        response_time = end_time - start_time
        model_results.append({
            "response_time": response_time,
            "peak_vram_usage": vram_usage_peak
        })

    results[model_name] = model_results

    # Unload the model before moving on to the next one.
    del generator
    del sp
    gc.collect()
    time.sleep(2)

pynvml.nvmlShutdown()
print("\nAverage Results:")
for model_name, model_results in results.items():
avg_response_time = sum(result['response_time'] for result in model_results) / len(model_results)
avg_peak_vram_usage = sum(result['peak_vram_usage'] for result in model_results) / len(model_results)
print(f"Model: {model_name}")
print(f"Average Response Time: {avg_response_time:.2f} seconds")
print(f"Average Peak VRAM Usage: {avg_peak_vram_usage:.2f} MB")
print()

def generate_words(sp, step_results):
    """Regroup streamed SentencePiece tokens into whole words and yield them."""
    tokens_buffer = []
    for step_result in step_results:
        # SentencePiece marks the start of a new word with the "▁" prefix.
        is_new_word = step_result.token.startswith("▁")
        if is_new_word and tokens_buffer:
            word = sp.decode(tokens_buffer)
            if word:
                yield word
            tokens_buffer = []
        tokens_buffer.append(step_result.token_id)
    if tokens_buffer:
        word = sp.decode(tokens_buffer)
        if word:
            yield word

def build_prompt_solar_10_7b_instruct_v1_0(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"""### System:\n{system_message}\n\n### User:\n{user_prompt}\n\n### Assistant:\n"""
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens


def build_prompt_neural_chat_7b_v3_3(sp, dialog):
    system_prompt = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    user_prompt = dialog[0]["content"]
    prompt = f"### System:\n{system_prompt}\n### User:\n{user_prompt}\n### Assistant: "
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens


def build_prompt_llama_2_7b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens


def build_prompt_llama_2_13b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens


def build_prompt_mistral_7b_instruct_v0_2(sp, dialog):
    user_prompt = dialog[0]["content"]
    prompt = f"<s>[INST] {user_prompt} [/INST]</s>\n"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
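The models list the loop iterates over is just a list of dicts pairing each CTranslate2-converted model directory with its prompt builder; the only imports the script needs beyond that are os, time, gc, warnings, ctranslate2, sentencepiece (as spm), and pynvml. It looks something like this, with placeholder paths standing in for the private ones I omitted:

# Placeholder paths; each entry points at a CTranslate2-converted model directory.
models = [
    {"model_dir": "/path/to/SOLAR-10.7B-Instruct-v1.0-ct2", "build_prompt": build_prompt_solar_10_7b_instruct_v1_0},
    {"model_dir": "/path/to/neural-chat-7b-v3-3-ct2", "build_prompt": build_prompt_neural_chat_7b_v3_3},
    {"model_dir": "/path/to/Llama-2-7b-chat-ct2", "build_prompt": build_prompt_llama_2_7b_chat},
    {"model_dir": "/path/to/Llama-2-13b-chat-ct2", "build_prompt": build_prompt_llama_2_13b_chat},
    {"model_dir": "/path/to/Mistral-7B-Instruct-v0.2-ct2", "build_prompt": build_prompt_mistral_7b_instruct_v0_2},
]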
Let me know if anyone wants the full test script. EDIT: I ran it five times and changed the graph to only show the models that were run on all backends; I think it's a better chart.
Updated graph here, now including llama.cpp, which is apparently faster but uses slightly more VRAM, except for the 13B model, where it's 3 GB higher. The numbers also changed somewhat because I ran each model 15 times instead of 3.
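For anyone curious about the llama.cpp side, a GGUF model can be timed the same way. Here's a minimal sketch using llama-cpp-python; the model path, quant, and settings are placeholders, not my exact harness, and peak VRAM can be read with pynvml exactly as in the ctranslate2 script above.

# Sketch only: timing a GGUF model with llama-cpp-python; path and quant are placeholders.
import time
from llama_cpp import Llama

llm = Llama(model_path="/path/to/mistral-7b-instruct-v0.2.Q8_0.gguf", n_gpu_layers=-1, n_ctx=4096)

start_time = time.time()
output = llm("[INST] Hello, how are you? [/INST]", max_tokens=512, temperature=0.1, top_k=20, top_p=1.0)
response_time = time.time() - start_time

print(output["choices"][0]["text"])
print(f"Response time: {response_time:.2f} seconds")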
Ask if you're interested in the test scripts.