djl icon indicating copy to clipboard operation
djl copied to clipboard

deepspeed model.py example for question-answering task

Open greenpau opened this issue 2 years ago • 1 comments

Hi All,

Currently, this is my custom model.py for the text-generation task, which uses AutoModelForCausalLM.

Could you please share a sample custom model.py for question-answering task?

%%writefile {model_base_path}/code/model.py
from djl_python import Input, Output
import deepspeed
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer



def load_model(properties):
    """Build the tokenizer and model, then wrap the model for DeepSpeed inference.

    Args:
        properties: DJL serving properties dict; must contain
            "tensor_parallel_degree" and "model_dir", may contain
            "model_id" (overrides model_dir) and "dtype"
            ("float16" or "bfloat16").

    Returns:
        (model, tokenizer) where model is the DeepSpeed-injected module.
    """
    tp_degree = properties["tensor_parallel_degree"]
    # "model_id" (e.g. a HuggingFace hub id) takes precedence over the local dir.
    model_location = properties.get("model_id", properties["model_dir"])
    logging.info(f"Loading model in {model_location}")

    tokenizer = AutoTokenizer.from_pretrained(model_location)
    model = AutoModelForCausalLM.from_pretrained(
        model_location, low_cpu_mem_usage=True
    )

    # Optionally down-cast weights before handing the model to DeepSpeed.
    casts = {"float16": torch.float16, "bfloat16": torch.bfloat16}
    requested = properties.get("dtype")
    if requested in casts:
        model.to(casts[requested])

    logging.info(f"Starting DeepSpeed init with TP={tp_degree}")
    engine = deepspeed.init_inference(
        model,
        mp_size=tp_degree,
        dtype=model.dtype,
        trust_remote_code=True,
        replace_method='auto',
        replace_with_kernel_inject=True,
    )
    # init_inference returns an engine wrapper; .module is the usable model.
    return engine.module, tokenizer


# Lazily-initialized singletons, populated on the first call to handle().
model = None
tokenizer = None
# NOTE(review): generator is never assigned or read in this file — presumably
# a leftover from a pipeline()-based variant; confirm before removing.
generator = None


def run_inference(model, tokenizer, data, params):
    """Tokenize a batch of prompts, run generate(), and decode the results.

    Args:
        model: a causal-LM model exposing .generate().
        tokenizer: the matching HuggingFace tokenizer.
        data: list of prompt strings.
        params: extra keyword arguments forwarded to model.generate().

    Returns:
        List of decoded output strings, one per prompt.
    """
    # Padding requires a pad token; reuse EOS since causal LMs often lack one.
    tokenizer.pad_token = tokenizer.eos_token
    encoded = tokenizer.batch_encode_plus(
        data, return_tensors="pt", padding=True
    )
    device = torch.cuda.current_device()
    # Move every tensor field (input_ids, attention_mask, ...) onto the GPU.
    for key, value in encoded.items():
        if torch.is_tensor(value):
            encoded[key] = value.to(device)
    generated = model.generate(**encoded, **params)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)


def handle(inputs: Input):
    """DJL model-server entry point: lazily load the model, then serve a request.

    Args:
        inputs: the server request; JSON body with "inputs" (list of prompts)
            and optional "parameters" (generate() kwargs).

    Returns:
        Output with {"outputs": [...]} or None for the warmup ping.
    """
    global model, tokenizer
    # `is None` instead of truthiness: model objects may define surprising
    # __bool__/__len__ semantics, and identity is the intended check.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None
    data = inputs.get_as_json()

    input_sentences = data["inputs"]
    # "parameters" is optional in the request payload; default to no overrides
    # rather than raising KeyError.
    params = data.get("parameters", {})

    outputs = run_inference(model, tokenizer, input_sentences, params)
    result = {"outputs": outputs}
    return Output().add_as_json(result)

greenpau avatar Jun 15 '23 13:06 greenpau

@greenpau

For the question-answering task, you have several choices:

  1. You can run question answering task with pure java: https://github.com/deepjavalibrary/djl-demo/blob/master/huggingface/nlp/src/main/java/com/examples/QuestionAnswering.java
  2. You can use our built-in huggingface.py
  3. You can also find example with customized model.py: https://github.com/deepjavalibrary/djl-demo/tree/master/huggingface/python/question_answering

frankfliu avatar Jun 20 '23 17:06 frankfliu