error: answer_relevancy with Bedrock model
Describe the bug I am getting an error when using Bedrock embedding model with answer relevancy metric.
Ragas version: 0.0.21 Python version: 3.10.8
Code to Reproduce from ragas import evaluate from ragas.metrics import answer_relevancy import nest_asyncio # CHECK NOTES
NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.
nest_asyncio.apply()
result = evaluate( amnesty_qa["eval"].select(range(3)), metrics=[answer_relevancy], llm=bedrock_model, embeddings=bedrock_embeddings, )
result
Error trace
ValidationException Traceback (most recent call last) File /usr/local/lib/python3.10/site-packages/langchain_community/embeddings/bedrock.py:131, in BedrockEmbeddings._embedding_func(self, text) 129 try: 130 # invoke bedrock API --> 131 response = self.client.invoke_model( 132 body=body, 133 modelId=self.model_id, 134 accept="application/json", 135 contentType="application/json", 136 ) 138 # format output based on provider
File /usr/local/lib/python3.10/site-packages/botocore/client.py:553, in ClientCreator._create_api_method.
File /usr/local/lib/python3.10/site-packages/botocore/client.py:1009, in BaseClient._make_api_call(self, operation_name, api_params) 1008 error_class = self.exceptions.from_code(error_code) -> 1009 raise error_class(parsed_response, operation_name) 1010 else:
ValidationException: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: expected minLength: 1, actual: 0, please reformat your input and try again.
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last) Cell In[42], line 1 ----> 1 result = evaluate( 2 amnesty_qa["eval"].select(range(3)), 3 metrics=[answer_relevancy], 4 llm=bedrock_model, 5 embeddings=bedrock_embeddings, 6 )
File ~/Projects/ragas/src/ragas/evaluation.py:204, in evaluate(dataset, metrics, llm, embeddings, callbacks, is_async, max_workers, raise_exceptions, column_map) 201 if not evaluation_group_cm.ended: 202 evaluation_rm.on_chain_error(e) --> 204 raise e 205 else: 206 result = Result( 207 scores=Dataset.from_list(scores), 208 dataset=dataset, 209 binary_columns=binary_metrics, 210 )
File ~/Projects/ragas/src/ragas/evaluation.py:187, in evaluate(dataset, metrics, llm, embeddings, callbacks, is_async, max_workers, raise_exceptions, column_map) 184 scores = [] 185 try: 186 # get the results --> 187 results = executor.results() 188 # convert results to dataset_like 189 for i, _ in enumerate(dataset):
File ~/Projects/ragas/src/ragas/executor.py:119, in Executor.results(self) 117 r = (-1, np.nan) 118 if self.raise_exceptions: --> 119 raise e 120 finally: 121 results.append(r)
File ~/Projects/ragas/src/ragas/executor.py:115, in Executor.results(self) 113 r = (-1, np.nan) 114 try: --> 115 r = future.result() 116 except Exception as e: 117 r = (-1, np.nan)
File /usr/local/Cellar/python@3.10/3.10.8/Frameworks/Python.framework/Versions/3.10/lib/python3.10/concurrent/futures/_base.py:451, in Future.result(self, timeout) 449 raise CancelledError() 450 elif self._state == FINISHED: --> 451 return self.__get_result() 453 self._condition.wait(timeout) 455 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
File /usr/local/Cellar/python@3.10/3.10.8/Frameworks/Python.framework/Versions/3.10/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self) 401 if self._exception: 402 try: --> 403 raise self._exception 404 finally: 405 # Break a reference cycle with the exception in self._exception 406 self = None
File /usr/local/Cellar/python@3.10/3.10.8/Frameworks/Python.framework/Versions/3.10/lib/python3.10/concurrent/futures/thread.py:58, in _WorkItem.run(self) 55 return 57 try: ---> 58 result = self.fn(*self.args, **self.kwargs) 59 except BaseException as exc: 60 self.future.set_exception(exc)
File ~/Projects/ragas/src/ragas/executor.py:37, in Executor.wrap_callable_with_index.
File ~/Projects/ragas/src/ragas/metrics/base.py:76, in Metric.score(self, row, callbacks) 74 if not group_cm.ended: 75 rm.on_chain_error(e) ---> 76 raise e 77 else: 78 if not group_cm.ended:
File ~/Projects/ragas/src/ragas/metrics/base.py:72, in Metric.score(self, row, callbacks) 68 rm, group_cm = new_group( 69 self.name, inputs=row, callbacks=callbacks, is_async=False 70 ) 71 try: ---> 72 score = self._score(row=row, callbacks=group_cm) 73 except Exception as e: 74 if not group_cm.ended:
File ~/Projects/ragas/src/ragas/metrics/_answer_relevance.py:133, in AnswerRelevancy._score(self, row, callbacks) 124 result = self.llm.generate_text( 125 prompt, 126 n=self.strictness, 127 callbacks=callbacks, 128 ) 129 response = [ 130 json_loader.safe_load(r.text, self.llm) for r in result.generations[0] 131 ] --> 133 return self._calculate_score(response, row)
File ~/Projects/ragas/src/ragas/metrics/_answer_relevance.py:111, in AnswerRelevancy._calculate_score(self, response, row) 101 gen_questions = [ 102 item.get("question", "") for item in response if isinstance(item, dict) 103 ] 104 committal = np.any( 105 [ 106 item.get("noncommittal", False) (...) 109 ] 110 ) --> 111 cosine_sim = self.calculate_similarity(question, gen_questions) 112 score = cosine_sim.mean() * int(not committal) 114 return score
File ~/Projects/ragas/src/ragas/metrics/_answer_relevance.py:87, in AnswerRelevancy.calculate_similarity(self, question, generated_questions) 84 assert self.embeddings is not None 85 question_vec = np.asarray(self.embeddings.embed_query(question)).reshape(1, -1) 86 gen_question_vec = np.asarray( ---> 87 self.embeddings.embed_documents(generated_questions) 88 ) 89 norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm( 90 question_vec, axis=1 91 ) 92 return ( 93 np.dot(gen_question_vec, question_vec.T).reshape( 94 -1, 95 ) 96 / norm 97 )
File /usr/local/lib/python3.10/site-packages/langchain_community/embeddings/bedrock.py:159, in BedrockEmbeddings.embed_documents(self, texts) 157 results = [] 158 for text in texts: --> 159 response = self._embedding_func(text) 160 results.append(response) 161 return results
File /usr/local/lib/python3.10/site-packages/langchain_community/embeddings/bedrock.py:146, in BedrockEmbeddings._embedding_func(self, text) 144 return response_body.get("embedding") 145 except Exception as e: --> 146 raise ValueError(f"Error raised by inference endpoint: {e}")
ValueError: Error raised by inference endpoint: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: expected minLength: 1, actual: 0, please reformat your input and try again.
I'm seeing the same issue with Python 3.11.4 and ragas 0.0.22.
I'm working with a Bedrock Claude model as well. Can you please share which embedding model you used here? One thing I don't really understand: if we are using a Bedrock model, do we have to use an embedding model from Bedrock too? Shouldn't this match the embedding model actually used for inference?
Also, have you observed that with Bedrock the metric results are often "null" or "nan" for faithfulness and context_precision? For example: "context_precision":null,"faithfulness":null,"answer_relevancy":0.7777600149
Hey @shahules786, I made some improvements here — could you take a look?