FlagEmbedding
FlagEmbedding copied to clipboard
np.save 带来的误差影响
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from FlagEmbedding import BGEM3FlagModel
import pandas as pd
import numpy as np
if __name__ == "__main__":
    # Reproduction script: re-encode the first text of a parquet shard with
    # BGE-M3 and compare the fresh embedding against the vector stored at the
    # same index in the companion .npy file.
    model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
    df = pd.read_parquet("00000.parquet")
    npy_data = np.load("00000.npy")

    # Read the column directly; iterrows() would build a Series per row just
    # to pick out this one field.
    texts_all = df["text"].tolist()

    for index, text in enumerate(texts_all):
        print(text[:100])

        # Encode the same text twice so the run-to-run repeatability of the
        # model can be told apart from any difference to the stored vector.
        embedding1 = model.encode([text], max_length=2048)['dense_vecs'][0]
        embedding2 = model.encode([text], max_length=2048)['dense_vecs'][0]

        # NOTE(review): assumes row `index` of the .npy file was produced from
        # row `index` of the parquet file -- confirm against the script that
        # wrote the .npy file.
        test_embed = npy_data[index]

        # Trailing comments record the output observed by the reporter.
        print(test_embed.dtype)  # float16
        print(embedding1.dtype)  # float16
        print(embedding2.dtype)  # float16
        print(embedding1.shape)  # (1024,)
        print(embedding2.shape)  # (1024,)
        print(test_embed.shape)  # (1024,)

        # Element-wise exact equality against the stored vector and between
        # the two fresh encodings.
        print(embedding1 == test_embed)  # [False False False ... False False False]
        print(embedding2 == test_embed)  # [False False False ... False False False]
        print(embedding2 == embedding1)  # [ True True True ... True True True]

        # Allow an absolute tolerance of 1e-3.
        print(np.allclose(test_embed, embedding2, atol=1e-3))  # False

        # Dot product between the fresh and the stored vector.
        similarity = embedding1 @ test_embed.T
        print(similarity)  # 0.999

        # Only the first text is inspected.
        break
测试了一下,结果如下所示。想问一下,np.save 带来的误差影响有多大?为什么会出现这么大的偏差?
你好,@dbcSep03。这个误差属于正常现象,在下游检索场景中,这种误差基本不会对检索结果产生较大影响。