How can I use DeepSpeed for inference?
here is my GPU info:
model name: WizardCoder-15B-V1.0
here is my pkgs info:
Python 3.10.9, CUDA 11.7, torch 1.13.1+cu117, transformers 4.28.1
here is my ds_report:
here is my cmd:
deepspeed --num_gpus 2 ds-WizardCoder.py --base_model my_model_path
here is my inference code:
import sys
import os
import fire
import time
import torch
import transformers
import json
import deepspeed
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from deepspeed.runtime.utils import see_memory_usage

query = 'Write a Python code to count 1 to 10.'


def evaluate(
    batch_data,
    tokenizer,
    model,
    input=None,
    temperature=1,
    top_p=0.9,
    top_k=40,
    num_beams=1,
    max_new_tokens=2048,
    **kwargs,
):
    # Build the prompt, tokenize it, and run generation on the GPU.
    prompts = generate_prompt(batch_data, input)
    inputs = tokenizer(prompts, return_tensors="pt", max_length=256, truncation=True, padding=True)
    input_ids = inputs["input_ids"].to("cuda")
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences
    output = tokenizer.batch_decode(s, skip_special_tokens=True)
    return output


def generate_prompt(instruction, input=None):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""


def main(
    load_8bit: bool = False,
    base_model: str = "Model_Path",
):
    assert base_model, (
        "Please specify a --base_model, e.g. --base_model='bigcode/starcoder'"
    )

    # The deepspeed launcher sets LOCAL_RANK / WORLD_SIZE for each process.
    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    print(f'local_rank: {local_rank}, world_size: {world_size}')

    if local_rank == 0:
        see_memory_usage("before init", True)

    deepspeed.init_distributed()

    t0 = time.time()
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.config.pad_token_id = tokenizer.pad_token_id

    if not load_8bit:
        model.half()
    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    if local_rank == 0:
        print(f"initialization time: {(time.time() - t0) * 1000}ms")
        see_memory_usage("after init", True)

    # Wrap the model with DeepSpeed's inference engine (tensor parallel across world_size GPUs).
    model = deepspeed.init_inference(
        model,
        mp_size=world_size,
        dtype=torch.half,
        # max_tokens=args.max_tokens,
        replace_with_kernel_inject=True,
    )

    if local_rank == 0:
        see_memory_usage("after init_inference", True)

    _output = evaluate(query, tokenizer, model)
    final_output = _output[0].split("### Response:")[1].strip()
    print(f'out:{final_output}')


if __name__ == "__main__":
    fire.Fire(main)
I get an error:
Traceback (most recent call last):
  File "/home/liding/work/DeepSpeedExamples/inference/huggingface/text-generation/ds-WizardCoder.py", line 122, in <module>
    fire.Fire(main)
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/home/liding/work/DeepSpeedExamples/inference/huggingface/text-generation/ds-WizardCoder.py", line 88, in main
    model = AutoModelForCausalLM.from_pretrained(
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 471, in from_pretrained
    return model_class.from_pretrained(
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/transformers/modeling_utils.py", line 2795, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3123, in _load_pretrained_model
    new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/transformers/modeling_utils.py", line 698, in _load_state_dict_into_meta_model
    set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs)
  File "/home/liding/work/ds_venv/lib/python3.10/site-packages/accelerate/utils/modeling.py", line 313, in set_module_tensor_to_device
    new_value = value.to(device)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 0; 22.06 GiB total capacity; 8.58 GiB already allocated; 75.38 MiB free; 8.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
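My guess is that device_map="auto" is part of the problem: with --num_gpus 2 the launcher starts two processes, and each one tries to place a full fp16 copy of the checkpoint onto the GPUs through accelerate before deepspeed.init_inference ever gets a chance to shard it, so the cards fill up. Is the right pattern instead to load the weights on CPU and let DeepSpeed distribute them? Here is a minimal sketch of what I mean (untested; the model path is a placeholder, and I am not sure kernel injection supports this architecture, so replace_with_kernel_inject=False with DeepSpeed's automatic tensor parallelism may be needed instead):

import os
import torch
import deepspeed
from transformers import AutoTokenizer, AutoModelForCausalLM

local_rank = int(os.environ.get('LOCAL_RANK', 0))
world_size = int(os.environ.get('WORLD_SIZE', 1))

base_model = "my_model_path"  # placeholder: same checkpoint as above

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load on CPU only: no device_map="auto", so accelerate never touches the GPUs here.
# low_cpu_mem_usage=True keeps host RAM close to a single fp16 copy of the weights.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()

# Let DeepSpeed move/shard the weights across the ranks started by `deepspeed --num_gpus 2`.
model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    dtype=torch.half,
    replace_with_kernel_inject=True,  # may need False for this model architecture
)

prompt = "Write a Python code to count 1 to 10."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(f"cuda:{local_rank}")
with torch.no_grad():
    output = model.generate(input_ids=input_ids, max_new_tokens=256)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Does that match the intended usage, or is there a recommended configuration for running WizardCoder-15B with DeepSpeed inference on 2 GPUs?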
I also encountered a similar problem
I also encountered the same problem when using deepspeed.
May I ask if this has been resolved? I have the same problem.