transformers-bloom-inference
Inference (chatbot) does not work as expected on 2 GPUs with the bigscience/bloom-7b1 model
I am trying to create a simple chatbot using the bloom-7b1 model (I may use bigger models later), based on bloom-ds-zero-inference.py. Here is my code:
import json
import os
from pathlib import Path

import deepspeed
import torch
import torch.distributed as dist
from huggingface_hub import snapshot_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
from transformers.utils import is_offline_mode

local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))

deepspeed.init_distributed("nccl")
rank = dist.get_rank()


def print_rank0(*msg):
    if rank != 0:
        return
    print(*msg)


def get_repo_root(model_name_or_path):
    # checks if online or not
    if is_offline_mode():
        print_rank0("Offline mode: forcing local_files_only=True")

    # download only on first process
    if rank == 0:
        snapshot_download(
            model_name_or_path,
            local_files_only=is_offline_mode(),
            cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
            ignore_patterns=["*.safetensors"],
        )

    dist.barrier()

    return snapshot_download(
        model_name_or_path,
        local_files_only=is_offline_mode(),
        cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
        ignore_patterns=["*.safetensors"],
    )


def get_checkpoint_files(model_name_or_path):
    cached_repo_dir = get_repo_root(model_name_or_path)

    # extensions: .bin | .pt
    # creates a list of paths from all downloaded files in cache dir
    file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()]
    return file_list


def write_checkpoints_json():
    checkpoint_files = get_checkpoint_files(model_name)
    if rank == 0:
        data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
        json.dump(data, open(checkpoints_json, "w"))


checkpoints_json = "checkpoints.json"


class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops
        self.ENCOUNTERS = encounters

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        # stop once the stop word has appeared ENCOUNTERS times in the sequence
        stop_count = 0
        for stop in self.stops:
            tokens = input_ids[0]
            stop_count = (stop == tokens).sum().item()

        if stop_count >= self.ENCOUNTERS:
            return True
        return False


model_name = "bigscience/bloom-7b1"
infer_dtype = "float16"

tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

dtype = torch.float16

stop_words_ids = [
    tokenizer(stop_word, return_tensors="pt")["input_ids"].squeeze() for stop_word in ["Question:"]
]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=2)])

with deepspeed.OnDevice(dtype=dtype, device="meta"):
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

model = model.eval()

repo_root = get_repo_root(model_name)
write_checkpoints_json()
dist.barrier()

kwargs = dict(replace_with_kernel_inject=True)

model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    base_dir=repo_root,
    dtype=getattr(torch, infer_dtype),
    checkpoint=checkpoints_json,
    **kwargs,
)

model = model.module


def chatbot(question):
    prompt = "You are an AI chatbot named Bobby. Your job is to answer questions related to cartoon characters. Respond 'not sure' if unsure about answer.\n"
    prompt += "Question:" + " " + question + "\n" + "Answer:" + " "
    num_tokens = 100
    inputs = [prompt]
    generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False, stopping_criteria=stopping_criteria)

    input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True)
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())

    outputs = model.generate(**input_tokens, **generate_kwargs)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(outputs)


if __name__ == "__main__":
    while True:
        question = input("You: ")
        if question == "q":
            break
        answer = chatbot(question)
I have not applied any post-processing to the output yet. The script works fine if I run it with

deepspeed --num_gpus 1 inference.py

but when I run it with

deepspeed --num_gpus 2 inference.py

nothing seems to happen; on pressing Enter, memory stats are printed, and I have to exit with Ctrl+C.
I am using two Tesla V100 GPUs, with deepspeed==0.9.2, torch==1.14.0a0+410ce96, and Python 3.8.10.
Um, I am not sure. Maybe your process is getting stuck somewhere?
I think the issue is that the input is not shared with the second GPU. I have a similar issue with microsoft/bloom-deepspeed-inference-int8: if I repeat the input X times (X = number of GPUs), inference keeps going and produces the output. Something along these lines might work as a workaround (an untested sketch, not code from this repo; read_question_rank0 is just a name I made up): have only rank 0 read from stdin and broadcast the prompt with torch.distributed.broadcast_object_list, so every rank calls generate() on the same text instead of each process blocking on its own input().
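
# Untested sketch: rank 0 reads the question and broadcasts it to the other ranks,
# reusing rank, dist, and chatbot() from the script above.
def read_question_rank0():
    payload = [None]
    if rank == 0:
        payload[0] = input("You: ")
    # broadcast_object_list sends the Python string from rank 0 to all other ranks
    dist.broadcast_object_list(payload, src=0)
    return payload[0]


if __name__ == "__main__":
    while True:
        question = read_question_rank0()
        if question == "q":
            break
        chatbot(question)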
Instead, I tried bigscience/bloom based on bloom-accelerate-inference.py, and it works well with interactive input.
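For reference, this is roughly what that single-process setup looks like (a minimal sketch, assuming bloom-7b1 and float16; the real bloom-accelerate-inference.py does more, e.g. setting per-GPU memory limits). Because accelerate shards the model across the GPUs inside one process via device_map="auto", you launch it with plain python and input() is only read once:

# Single-process alternative in the style of bloom-accelerate-inference.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "bigscience/bloom-7b1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" (requires accelerate to be installed) spreads the layers over both GPUs
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
model.eval()

while True:
    question = input("You: ")
    if question == "q":
        break
    inputs = tokenizer(question, return_tensors="pt").to("cuda:0")
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))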