
[BUG] Cannot run the official EVA-CLIP inference SDK on a CPU device

Open · PancakeAwesome opened this issue on Jun 14, 2023 · 0 comments

Describe the bug

Traceback (most recent call last):
  File "evaclip_infer.py", line 86, in <module>
    main()
  File "evaclip_infer.py", line 61, in main
    image_features = model.encode_image(image)
  File "/mnt/EVA/EVA-CLIP/rei/eva_clip/model.py", line 297, in encode_image
    features = self.visual(image)
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/EVA/EVA-CLIP/rei/eva_clip/eva_vit_model.py", line 523, in forward
    x = self.forward_features(x)
  File "/mnt/EVA/EVA-CLIP/rei/eva_clip/eva_vit_model.py", line 510, in forward_features
    x = blk(x, rel_pos_bias=rel_pos_bias)
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/EVA/EVA-CLIP/rei/eva_clip/eva_vit_model.py", line 287, in forward
    x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/EVA/EVA-CLIP/rei/eva_clip/eva_vit_model.py", line 202, in forward
    x = xops.memory_efficient_attention(
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/xformers/ops/fmha/__init__.py", line 192, in memory_efficient_attention
    return _memory_efficient_attention(
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/xformers/ops/fmha/__init__.py", line 295, in _memory_efficient_attention
    return _fMHA.apply(
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/xformers/ops/fmha/__init__.py", line 41, in forward
    out, op_ctx = _memory_efficient_attention_forward_requires_grad(
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/xformers/ops/fmha/__init__.py", line 320, in _memory_efficient_attention_forward_requires_grad
    op = _dispatch_fw(inp)
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/xformers/ops/fmha/dispatch.py", line 104, in _dispatch_fw
    return _run_priority_list(
  File "/opt/conda/envs/torch2/lib/python3.8/site-packages/xformers/ops/fmha/dispatch.py", line 79, in _run_priority_list
    raise NotImplementedError(msg)
NotImplementedError: No operator found for `memory_efficient_attention_forward` with inputs:
     query       : shape=(1, 197, 12, 64) (torch.float32)
     key         : shape=(1, 197, 12, 64) (torch.float32)
     value       : shape=(1, 197, 12, 64) (torch.float32)
     attn_bias   : <class 'NoneType'>
     p           : 0.0
`cutlassF` is not supported because:
    device=cpu (supported: {'cuda'})
`flshattF` is not supported because:
    device=cpu (supported: {'cuda'})
    dtype=torch.float32 (supported: {torch.bfloat16, torch.float16})
`tritonflashattF` is not supported because:
    device=cpu (supported: {'cuda'})
    dtype=torch.float32 (supported: {torch.bfloat16, torch.float16})
`smallkF` is not supported because:
    max(query.shape[-1] != value.shape[-1]) > 32
    has custom scale
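
Every backend that xformers tries (`cutlassF`, `flshattF`, `tritonflashattF`, `smallkF`) is either CUDA-only or rejects float32, so the dispatcher has nothing it can run on CPU and raises. Below is a minimal sketch of a CPU fallback, assuming q/k/v arrive in the (batch, seq_len, num_heads, head_dim) layout that xformers expects; the helper name is mine, not part of EVA-CLIP:

import torch
import torch.nn.functional as F

def attention_with_cpu_fallback(q, k, v, p=0.0):
    """Use xformers on CUDA; fall back to PyTorch SDPA on CPU.

    q, k, v: (batch, seq_len, num_heads, head_dim), as xformers expects.
    """
    if q.is_cuda:
        import xformers.ops as xops
        return xops.memory_efficient_attention(q, k, v, p=p)
    # PyTorch 2.x fused attention wants (batch, num_heads, seq_len, head_dim).
    q, k, v = (t.transpose(1, 2) for t in (q, k, v))
    out = F.scaled_dot_product_attention(q, k, v, dropout_p=p)
    return out.transpose(1, 2)  # back to (batch, seq_len, num_heads, head_dim)

Guarding the xops.memory_efficient_attention call site in eva_clip/eva_vit_model.py (line 202 in the traceback) with something like this lets encode_image run on CPU, at the cost of the memory savings xformers provides on GPU.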

To Reproduce

The script I executed:

import torch
from eva_clip import create_model_and_transforms, get_tokenizer
from PIL import Image

model_name = "EVA02-CLIP-B-16" 
pretrained = "eva_clip" # or "/path/to/EVA02_CLIP_B_psz16_s8B.pt"

image_path = "CLIP.png"
caption = ["a diagram", "a dog", "a cat"]

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = create_model_and_transforms(model_name, pretrained, force_custom_clip=True)
tokenizer = get_tokenizer(model_name)
model = model.to(device)

image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
text = tokenizer(caption).to(device)

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[0.8275, 0.1372, 0.0352]]

Post related information

PyTorch: 2.0.1+cu116
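
A quick probe confirms which side of the dispatch an environment is on (a sketch; it only prints versions and CUDA availability):

import torch
import xformers

print("torch:", torch.__version__)
print("xformers:", xformers.__version__)
print("CUDA available:", torch.cuda.is_available())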


PancakeAwesome · Jun 14 '23 07:06