self-rag
self-rag copied to clipboard
Out of memory at inference in free tier Google Colab
Tried the quick start code in free tier Google Colab. Got out of memory error. Is this expected ?
from vllm import LLM, SamplingParams
model = LLM("selfrag/selfrag_llama2_7b", download_dir="/gscratch/h2lab/akari/model_cache", dtype="half") sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=100, skip_special_tokens=False)
def format_prompt(input, paragraph=None):
prompt = "### Instruction:\n{0}\n\n### Response:\n".format(input)
if paragraph is not None:
prompt += "[Retrieval]
query_1 = "Leave odd one out: twitter, instagram, whatsapp." query_2 = "Can you tell me the difference between llamas and alpacas?" queries = [query_1, query_2]
for a query that doesn't require retrieval
preds = model.generate([format_prompt(query) for query in queries], sampling_params) for pred in preds: print("Model prediction: {0}".format(pred.outputs[0].text))