Chinese-CLIP
Chinese-CLIP copied to clipboard
中文和英文同时匹配图片时中文得分非常低
当对同一张图片分别与中文文本和英文文本计算相似度时, 中文文本的得分远小于英文文本的得分.
import torch
from PIL import Image
import requests
import cn_clip.clip as clip
from cn_clip.clip import load_from_name, available_models
# Zero-shot image/text matching demo: score one image against a list of
# candidate prompts and print one probability per prompt.

print("Available models:", available_models())
# Available models: ['ViT-B-16', 'ViT-L-14', 'ViT-L-14-336', 'ViT-H-14', 'RN50']

# Run on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_from_name("ViT-L-14-336", device=device, download_root='checkpoint')
model.eval()

img_path = "examples/dog.jpg"
# Candidate prompts: two Chinese words and one English word.
text_list = ["猫", "狗", "dog"]

# Open the image inside a context manager so the file handle is released
# as soon as the preprocessed tensor has been built.
with Image.open(img_path) as img:
    image = preprocess(img).unsqueeze(0).to(device)
text = clip.tokenize(text_list).to(device)

with torch.no_grad():
    # get_similarity is called with the raw image and token tensors, not with
    # precomputed features. The original script also encoded and normalized
    # image/text features separately but never read them afterwards, so that
    # redundant work has been dropped.
    logits_per_image, _ = model.get_similarity(image, text)
    # The softmax runs across the prompts in text_list, so every printed value
    # is relative to the other candidates rather than an absolute score.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# probs has one row for the single input image; pair each prompt with its score.
for prompt, prob in zip(text_list, probs[0]):
    print(f"{prompt}: {prob}")
结果为: 猫: 4.565715789794922e-05 狗: 0.010009765625 dog: 0.98974609375
这种问题怎么解决?