deepeval
deepeval copied to clipboard
Cant use HuggingFace Model for evaluation
When i follow the example on this page: https://docs.confident-ai.com/docs/metrics-introduction
and try to use Mistral-7B as evaluation-model, i always get this error when running the exact code in the tutorial. It seems there is a mistake in the code when using HugigngFace models for evaluation instead of ChatGPT.
Error:
JSONDecodeError Traceback (most recent call last) File ~/.conda/envs/evaluation/lib/python3.12/site-packages/deepeval/metrics/utils.py:58, in trimAndLoadJson(input_string, metric) 57 try: ---> 58 return json.loads(jsonStr) 59 except json.JSONDecodeError:
File ~/.conda/envs/evaluation/lib/python3.12/json/init.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 343 if (cls is None and object_hook is None and 344 parse_int is None and parse_float is None and 345 parse_constant is None and object_pairs_hook is None and not kw): --> 346 return _default_decoder.decode(s) 347 if cls is None:
File ~/.conda/envs/evaluation/lib/python3.12/json/decoder.py:340, in JSONDecoder.decode(self, s, _w) 339 if end != len(s): --> 340 raise JSONDecodeError("Extra data", s, end) 341 return obj
JSONDecodeError: Extra data: line 4 column 1 (char 110)
During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) Cell In[4], line 18 ... ---> 63 raise ValueError(error_str) 64 except Exception as e: 65 raise Exception(f"An unexpected error occurred: {str(e)}")
ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.
Code: from transformers import AutoModelForCausalLM, AutoTokenizer from deepeval.models.base_model import DeepEvalBaseLLM import asyncio
class Mistral7B(DeepEvalBaseLLM): def init( self, model, tokenizer ): self.model = model self.tokenizer = tokenizer
def load_model(self):
return self.model
def generate(self, prompt: str) -> str:
model = self.load_model()
device = "cuda" # the device to load the model onto
model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
model.to(device)
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
output = self.tokenizer.batch_decode(generated_ids)[0]
#result = f"{{ {output} }}"
return output
async def a_generate(self, prompt: str) -> str:
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.generate, prompt)
def get_model_name(self):
return "Mistral 7B"
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)
from deepeval import evaluate from deepeval.metrics import AnswerRelevancyMetric from deepeval.test_case import LLMTestCase
Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra cost."
metric = AnswerRelevancyMetric( threshold=0.7, model=mistral_7b, include_reason=True ) test_case = LLMTestCase( input="What if these shoes don't fit?", actual_output=actual_output )
metric.measure(test_case) print(metric.score) print(metric.reason)
or evaluate test cases in bulk
evaluate([test_case], [metric])
Thanks for the help in advance and all the best!