transformers-bloom-inference
Inference (chatbot) does not work as expected on 2 GPUs with the bigscience/bloom-7b1 model
I am trying to create a simple chatbot using the bloom-7b1 model (I may use bigger models later), based on bloom-ds-zero-inference.py. Here is my code:
import json
import os
from pathlib import Path

import deepspeed
import torch
import torch.distributed as dist
from huggingface_hub import snapshot_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
from transformers.utils import is_offline_mode

local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))

deepspeed.init_distributed("nccl")
rank = dist.get_rank()


def print_rank0(*msg):
    if rank != 0:
        return
    print(*msg)


def get_repo_root(model_name_or_path):
    # checks if online or not
    if is_offline_mode():
        print_rank0("Offline mode: forcing local_files_only=True")

    # download only on first process
    if rank == 0:
        snapshot_download(
            model_name_or_path,
            local_files_only=is_offline_mode(),
            cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
            ignore_patterns=["*.safetensors"],
        )

    dist.barrier()

    return snapshot_download(
        model_name_or_path,
        local_files_only=is_offline_mode(),
        cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
        ignore_patterns=["*.safetensors"],
    )


def get_checkpoint_files(model_name_or_path):
    cached_repo_dir = get_repo_root(model_name_or_path)

    # extensions: .bin | .pt
    # creates a list of paths from all downloaded files in cache dir
    file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()]
    return file_list


def write_checkpoints_json():
    checkpoint_files = get_checkpoint_files(model_name)
    if rank == 0:
        data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
        json.dump(data, open(checkpoints_json, "w"))


checkpoints_json = "checkpoints.json"


class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops
        self.ENCOUNTERS = encounters

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        # stop once the stop word has appeared ENCOUNTERS times in the sequence
        stop_count = 0
        for stop in self.stops:
            tokens = input_ids[0]
            stop_count = (stop == tokens).sum().item()

        if stop_count >= self.ENCOUNTERS:
            return True
        return False


model_name = "bigscience/bloom-7b1"
infer_dtype = "float16"

tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

dtype = torch.float16

stop_words_ids = [
    tokenizer(stop_word, return_tensors="pt")["input_ids"].squeeze() for stop_word in ["Question:"]
]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=2)])

with deepspeed.OnDevice(dtype=dtype, device="meta"):
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

model = model.eval()

repo_root = get_repo_root(model_name)
write_checkpoints_json()
dist.barrier()

kwargs = dict(replace_with_kernel_inject=True)

model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    base_dir=repo_root,
    dtype=getattr(torch, infer_dtype),
    checkpoint=checkpoints_json,
    **kwargs,
)

model = model.module


def chatbot(question):
    prompt = "You are an AI chatbot named Bobby. Your job is to answer questions related to cartoon characters. Respond 'not sure' if unsure about answer.\n"
    prompt += "Question:" + " " + question + "\n" + "Answer:" + " "
    num_tokens = 100
    inputs = [prompt]
    generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False, stopping_criteria=stopping_criteria)

    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())

    outputs = model.generate(**input_tokens, **generate_kwargs)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(outputs)


if __name__ == "__main__":
    while True:
        question = input("You: ")
        if question == "q":
            break
        answer = chatbot(question)
I have not applied any post-processing to the output yet. The script works fine if I run it with

deepspeed --num_gpus 1 inference.py

but when I run it with

deepspeed --num_gpus 2 inference.py

nothing seems to happen; on pressing Enter, memory stats are printed, and I have to exit with Ctrl+C.
I am using two Tesla V100 GPUs, with deepspeed==0.9.2, torch==1.14.0a0+410ce96, and Python 3.8.10.
Um, I am not sure. Maybe your process is getting stuck somewhere?
I think the issue is that the input is not shared with the second GPU. I have a similar issue with microsoft/bloom-deepspeed-inference-int8: if I repeat the input X times (X = number of GPUs), inference keeps going and produces the output. Something along these lines might work as a workaround (an untested sketch, not code from this repo; read_question_rank0 is just a name I made up): have only rank 0 read from stdin and broadcast the prompt with torch.distributed.broadcast_object_list, so every rank calls generate() on the same text instead of each process blocking on its own input().
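
# Untested sketch: rank 0 reads the question and broadcasts it to the other ranks,
# reusing rank, dist, and chatbot() from the script above.
def read_question_rank0():
    payload = [None]
    if rank == 0:
        payload[0] = input("You: ")
    # broadcast_object_list sends the Python string from rank 0 to all other ranks
    dist.broadcast_object_list(payload, src=0)
    return payload[0]


if __name__ == "__main__":
    while True:
        question = read_question_rank0()
        if question == "q":
            break
        chatbot(question)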
Instead, I tried bigscience/bloom based on bloom-accelerate-inference.py, and it works well with interactive input.
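For reference, this is roughly what that single-process setup looks like (a minimal sketch, assuming bloom-7b1 and float16; the real bloom-accelerate-inference.py does more, e.g. setting per-GPU memory limits). Because accelerate shards the model across the GPUs inside one process via device_map="auto", you launch it with plain python and input() is only read once:

# Single-process alternative in the style of bloom-accelerate-inference.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "bigscience/bloom-7b1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" (requires accelerate to be installed) spreads the layers over both GPUs
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
model.eval()

while True:
    question = input("You: ")
    if question == "q":
        break
    inputs = tokenizer(question, return_tensors="pt").to("cuda:0")
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))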