incorrect nomic embeddings
I was comparing the nomic embeddings produced by fastembed against the original sentence-transformers version, and they are very different.
import pandas as pd
from more_itertools import chunked
from typing import List
import numpy as np
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import torch
import os
from tqdm.notebook import tqdm
import json
from fastembed import SparseTextEmbedding, TextEmbedding
assert torch.cuda.is_available()
SEED = 25
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
def embed(texts: List[str]):
    # Prefix with the "clustering: " task instruction, as recommended in the nomic model card.
    embeddings = model.encode(["clustering: " + t for t in texts], convert_to_tensor=True)
    # Post-processing from the nomic-embed-text-v1.5 model card: layer norm, then L2 normalization.
    embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()
import types
embedding_model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5")
def embed_fast(texts: List[str]):
    embeddings = embedding_model.embed(["clustering: " + t for t in texts])
    # Force computation if embed() returns a generator
    if isinstance(embeddings, types.GeneratorType):
        embeddings = np.array(list(embeddings))
    return embeddings
res1 = embed(data)
res1
array([[ 0.0478127 , 0.07791077, -0.16337295, ..., -0.09588917,
-0.01815554, -0.0391101 ],
[ 0.00486873, 0.05552602, -0.17271836, ..., -0.06137123,
-0.01570066, 0.00191791],
[ 0.06523891, 0.0010743 , -0.18216555, ..., -0.07239737,
-0.04038522, -0.01559034],
...,
[ 0.00218753, 0.07514183, -0.20761742, ..., -0.06362353,
-0.01809935, -0.05537352],
[ 0.04343107, 0.09780316, -0.18920647, ..., -0.04333348,
-0.00627593, -0.05098606],
[ 0.07419404, -0.00313356, -0.17786372, ..., -0.12185868,
0.00074316, -0.03453541]], dtype=float32)
res2 = embed_fast(data)
res2
array([[ 0.02929905, 0.04431156, -0.15961514, ..., -0.06745377,
-0.05103005, -0.03928511],
[ 0.01270135, 0.05443013, -0.17877947, ..., -0.07663167,
-0.02374521, -0.01003957],
[ 0.04591728, 0.03366161, -0.22145797, ..., -0.06816148,
-0.04856569, -0.00438772],
...,
[ 0.00895302, 0.06434014, -0.22018844, ..., -0.06492045,
-0.02143181, -0.06246027],
[ 0.02247173, 0.06561402, -0.19789155, ..., -0.05780106,
-0.01238827, -0.0473954 ],
[ 0.05192114, 0.00191367, -0.1852103 , ..., -0.10932592,
-0.01199157, -0.03163441]], dtype=float32)
# compute cosine similarity between nth element in res1 and res2
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(res1[0].reshape(1, -1), res2[0].reshape(1, -1))
array([[0.85182214]], dtype=float32)
So the cosine similarity is only ~0.85; the vectors are substantially different.
The fastembed normalization (https://github.com/qdrant/fastembed/blob/main/fastembed/common/models.py#L49-L54) does not seem to follow the normalization described in https://huggingface.co/nomic-ai/nomic-embed-text-v1.5#sentence-transformers.
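For reference, the post-processing the Nomic model card prescribes for sentence-transformers is a layer norm over the embedding dimension followed by L2 normalization. Below is a minimal NumPy sketch of that post-processing; the raw matrix here is a hypothetical stand-in for unnormalized pooled embeddings, and is not necessarily what fastembed computes internally before its own normalization.

import numpy as np

def nomic_postprocess(raw: np.ndarray, eps: float = 1e-5) -> np.ndarray:
    # Layer norm across the embedding dimension (zero mean, unit variance per vector)...
    mean = raw.mean(axis=1, keepdims=True)
    var = raw.var(axis=1, keepdims=True)
    normed = (raw - mean) / np.sqrt(var + eps)
    # ...followed by L2 normalization, mirroring the F.layer_norm + F.normalize steps above.
    return normed / np.linalg.norm(normed, axis=1, keepdims=True)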
Ohh, we missed this completely in our tests — I'll look into this. Thanks a ton for reporting this!
Same for nomic embed v1. Any planned resolution on this?
Hey @k4u5h1k, sorry for the late response, yes, we're working on it, we'll fix it soon
Hi, I just added a comment on the PR that was merged (see https://github.com/qdrant/fastembed/pull/280#issuecomment-2276112677).
The addition of mean pooling solved the cosine similarity issue, but there are still inconsistencies in the normalization (especially when compared to Nomic's documentation). I think this issue should be reopened in the meantime.
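For anyone who wants to quantify the remaining gap, here is a minimal sketch that reports per-row cosine similarity across the whole batch rather than only the first element, assuming res1 and res2 from above (same inputs, same row order):

import numpy as np

def rowwise_cosine(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # Re-normalize defensively, then take the per-row dot product, which equals cosine similarity.
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.einsum("ij,ij->i", a, b)

sims = rowwise_cosine(res1, res2)
print(f"min={sims.min():.4f} mean={sims.mean():.4f}")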