BitNet
BitNet copied to clipboard
Intel NPU is so slow
I used Intel's AI Boost NPU, but it's too slow — NPU usage is only 37%.
# Build and install the BitNet-patched transformers fork from the local checkout.
cd .\transformers-bitnet
pip install .
# Runtime dependencies: PyTorch, HF accelerate, Intel's NPU offload library, and gradio for the UI.
pip install torch accelerate intel-npu-acceleration-library gradio
import torch
import torch._dynamo

# torch.compile graph breaks inside the NPU backend would otherwise abort the
# run; fall back to eager execution instead of raising.
torch._dynamo.config.suppress_errors = True

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from intel_npu_acceleration_library import NPUModelForCausalLM
from intel_npu_acceleration_library.compiler import CompilerConfig
import gradio as gr

model_id = "microsoft/bitnet-b1.58-2B-4T"

tokenizer = AutoTokenizer.from_pretrained(model_id)
compiler_conf = CompilerConfig(dtype=torch.bfloat16)
# optimized_model = torch.compile(model_id, backend="npu")
model = NPUModelForCausalLM.from_pretrained(
    model_id,
    config=compiler_conf,
    use_cache=True,
    attn_implementation="sdpa",
).eval()


def chat(user_input: str) -> str:
    """Generate a reply for *user_input*, streaming tokens to stdout.

    Fixes the original script's NameError: `user_input` was referenced at
    module level without ever being defined.

    Returns the decoded text of the newly generated tokens only (the prompt
    is stripped off).
    """
    # NOTE(review): this hand-built prompt format is kept byte-for-byte from
    # the original; confirm it matches the model's expected chat template.
    prompt = f"<s>用户:{user_input}\n助手:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer, skip_special_tokens=True)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.5,
        top_p=0.85,
        repetition_penalty=1.1,
        streamer=streamer,
        top_k=50,
        max_new_tokens=512,
    )
    # Slice off the prompt tokens so only the model's answer is returned.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)


if __name__ == "__main__":
    # gradio was imported but never used in the original snippet; wire up a
    # minimal UI so the script is runnable end to end.
    gr.Interface(fn=chat, inputs="text", outputs="text").launch()
What is your current token generation speed? I am afraid we do not have NPU kernels at this moment.