How to use return_source_documents to also extract the similarity score?
I was using RetrievalQA.from_chain_type, to which I passed parameters as:
RetrievalQA.from_chain_type(llm, chain_type, retriever=chroma_db.as_retriever(), return_source_documents=True)
Here, return_source_documents=True only returns the chunks from which the response was generated. Is there a way to also get the similarity score for each matched chunk (say it found 4 chunks most relevant to the query, how do I get their scores in decreasing order of similarity)?
The QA chains don't seem to have any direct way to return the similarity score along with the source documents. You can call the Chroma vector store directly for the scores though.
chroma_db.similarity_search_with_score(your_query)
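For reference, a minimal sketch (not from the thread) of calling the Chroma store directly, with chroma_db being the store from the question. Note that Chroma returns a distance by default, so a smaller score means more similar:

docs_and_scores = chroma_db.similarity_search_with_score(your_query, k=4)
for doc, score in docs_and_scores:
    print(score, doc.page_content[:80])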
Thanks for sharing this.
I have already tested this code, but I wanted to check whether it is possible with the RetrievalQA chain.
I am also looking for a way to return scores from a normal QA chain.
Same here. Is there any way to get scores from a chain?
+1 on this. I'd love to have a way to get the similarity scores from a chain.
I ended up doing something like this:
from typing import List

from langchain.schema import Document
from langchain.vectorstores.redis import RedisVectorStoreRetriever


class RedisVectorStoreRetrieverWithScores(RedisVectorStoreRetriever):
    """
    Hacky way to create a Redis retriever that adds the score to the metadata
    """

    def get_relevant_documents(self, query: str) -> List[Document]:
        # [NOTE] we removed the search-type dispatch; only search_type = "similarity" is supported
        if self.search_type != "similarity":
            raise ValueError("Only search_type='similarity' is supported with scores")
        docs_and_scores = self.vectorstore.similarity_search_with_score(query, k=self.k)
        # copy the score into each document's metadata before dropping it
        for doc, score in docs_and_scores:
            doc.metadata = {**doc.metadata, "score": score}
        return [doc for doc, _ in docs_and_scores]
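A usage sketch of the above (my addition, not part of the original comment), assuming redis_store is an existing LangChain Redis vector store and llm is your model; the retriever then plugs into RetrievalQA exactly like in the question:

from langchain.chains import RetrievalQA

# redis_store and llm are assumed to exist already
retriever = RedisVectorStoreRetrieverWithScores(vectorstore=redis_store, search_type="similarity", k=4)
qa = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
result = qa({"query": "my question"})
for doc in result["source_documents"]:
    print(doc.metadata["score"], doc.page_content[:80])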
For Weaviate it's simple, and I think it would be similar for other stores too:
just pass the additional params as search_kwargs to the retriever.
Weaviate().as_retriever(search_kwargs={"additional": ["vector", "certainty", "id"]})
This will give you the vector, certainty score, and uuid of the documents under the metadata of the returned Documents.
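A small usage sketch, not from the comment, assuming weaviate_store is an existing LangChain Weaviate vector store (the exact metadata keys depend on the integration version):

retriever = weaviate_store.as_retriever(search_kwargs={"additional": ["vector", "certainty", "id"]})
docs = retriever.get_relevant_documents("my question")
for doc in docs:
    print(doc.metadata)  # per the comment above, includes the vector, certainty and id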
I am also looking for a solution, but in my case I am using pgvector as my vector store. Any idea here?
Me too, looking for this for pgvector.
I want to retrieve not just the top k chunks from the DB but all chunks that pass a similarity-score threshold. Any idea how to do that?
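Not an answer from the thread, but one built-in option: vector stores that implement relevance scores support a score-threshold search type on the standard retriever. A sketch, assuming my_vector_store supports it (the values are examples):

retriever = my_vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.8, "k": 20},
)
docs = retriever.get_relevant_documents("my question")  # only docs above the threshold

Note that this only filters by the threshold; it does not attach the scores to the documents, so you would still combine it with one of the approaches in this thread if you need the scores themselves.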
Also looking for a solution to this for the Annoy vector store.
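If calling the vector store directly is acceptable, both PGVector and Annoy expose similarity_search_with_score, similar to the Chroma example above (the store variables here are placeholders):

docs_and_scores = pgvector_store.similarity_search_with_score("my question", k=4)
docs_and_scores = annoy_store.similarity_search_with_score("my question", k=4)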
Elaborating on the answers from @FrancescoSaverioZuppichini, I came up with this solution.
- replace
  retriever = my_vector_store.as_retriever(search_type="similarity")
  with
  retriever = RetrieverWithScores.from_vector_store(my_vector_store, search_type="similarity")
- Use a retrieval chain like the one below, which explicitly extracts the similarity score from doc.metadata
from operator import itemgetter

from langchain_core.runnables import RunnableLambda, RunnableParallel


def _get_context(docs) -> str:
    return "\n\n".join(doc.page_content for doc in docs)


def _get_references(docs) -> str:
    names = [doc.metadata["filename"] for doc in docs]
    pages = [doc.metadata["page_number"] for doc in docs]
    scores = [doc.metadata["similarity_score"] for doc in docs]
    return "\n\n".join(
        f"{n + 1} -> {name}, page {page}, relevance_score {score:.3f}"
        for n, (name, page, score) in enumerate(zip(names, pages, scores))
    )


# summarization_prompt and llm_summarizer are defined elsewhere in my app
retrieval_chain = (
    RunnableParallel(
        # this is expecting: chat_history, last_question
        stand_alone_question=(summarization_prompt | llm_summarizer),
    )
    | RunnableParallel(
        stand_alone_question=itemgetter("stand_alone_question"),
        retrieved_docs=(itemgetter("stand_alone_question") | retriever),
    )
    | RunnableParallel(
        context=(itemgetter("retrieved_docs") | RunnableLambda(_get_context)),
        stand_alone_question=itemgetter("stand_alone_question"),
        references=(itemgetter("retrieved_docs") | RunnableLambda(_get_references)),
    )
)
from typing import Any, List

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever


class RetrieverWithScores(VectorStoreRetriever):
    """
    A retriever that returns documents with their similarity scores.

    OLD:
        retriever = my_vector_store.as_retriever(search_type="similarity")
    NEW:
        retriever = RetrieverWithScores.from_vector_store(my_vector_store, search_type="similarity")
    """

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        if self.search_type == "similarity":
            docs_and_similarities = self.vectorstore._similarity_search_with_relevance_scores(
                query, **self.search_kwargs
            )
        elif self.search_type == "similarity_score_threshold":
            docs_and_similarities = self.vectorstore.similarity_search_with_relevance_scores(
                query, **self.search_kwargs
            )
        elif self.search_type == "mmr":
            # MMR does not return scores, so use a placeholder of 0.0
            docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
            docs_and_similarities = [(doc, 0.0) for doc in docs]
        else:
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        # store the score in each document's metadata
        for doc, score in docs_and_similarities:
            doc.metadata = {**doc.metadata, "similarity_score": score}
        return [doc for doc, _ in docs_and_similarities]

    @staticmethod
    def from_vector_store(vector_store: VectorStore, **kwargs: Any) -> "RetrieverWithScores":
        """
        Return a RetrieverWithScores initialized from this VectorStore.
        This is basically a copy of the VectorStore.as_retriever method.
        """
        tags = kwargs.pop("tags", None) or []
        tags.extend(vector_store._get_retriever_tags())
        return RetrieverWithScores(vectorstore=vector_store, **kwargs, tags=tags)
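A small usage sketch of the retriever on its own (my addition), with my_vector_store standing in for any LangChain vector store that implements relevance scores:

retriever = RetrieverWithScores.from_vector_store(
    my_vector_store, search_type="similarity", search_kwargs={"k": 4}
)
docs = retriever.get_relevant_documents("my question")
for doc in docs:
    print(doc.metadata["similarity_score"], doc.page_content[:80])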
My pretty straightforward proposal:
Change line 707 in file {python_env}\Lib\site-packages\langchain_core\vectorstores.py
Replace this:
docs = [doc for doc, _ in docs_and_similarities]
With this:
docs = []
for doc, score in docs_and_similarities:
    doc.metadata["score"] = score
    docs.append(doc)
This way you will have the score in the metadata dictionary of every returned chunk.