CLIP
Context Length Error
If I reduce the context length to anything under 77 when calling clip.tokenize, I get an error:
import numpy as np
import torch
import clip

def generate_clip_embeddings(captions, clip_model, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_model.to(device)
    embeddings = []
    for batch_start in range(0, len(captions), batch_size):
        batch_captions = captions[batch_start:batch_start + batch_size]
        # Tokenize with a shorter context length -- this is the change that triggers the error
        batch_inputs = clip.tokenize(batch_captions, context_length=56, truncate=True).to(device)
        with torch.no_grad():
            batch_outputs = clip_model.encode_text(batch_inputs)
            embeddings.append(batch_outputs.cpu().numpy())
    return np.vstack(embeddings)
RuntimeError Traceback (most recent call last)
Cell In[37], line 1
----> 1 clip_embeddings_train = generate_clip_embeddings(train_captions, clip_model, batch_size=128)
2 clip_embeddings_val = generate_clip_embeddings(val_captions, clip_model, batch_size=128)
Cell In[36], line 10, in generate_clip_embeddings(captions, clip_model, batch_size)
8 batch_inputs = clip.tokenize(batch_captions, context_length=56, truncate=True).to(device)
9 with torch.no_grad():
---> 10 batch_outputs = clip_model.encode_text(batch_inputs)
11 embeddings.append(batch_outputs.cpu().numpy())
13 return np.vstack(embeddings)
File /opt/conda/lib/python3.10/site-packages/clip/model.py:346, in CLIP.encode_text(self, text)
343 def encode_text(self, text):
344 x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
--> 346 x = x + self.positional_embedding.type(self.dtype)
347 x = x.permute(1, 0, 2) # NLD -> LND
348 x = self.transformer(x)
RuntimeError: The size of tensor a (56) must match the size of tensor b (77) at non-singleton dimension 1
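For what it's worth, the 77 seems to come from the text encoder itself rather than the tokenizer. Here is a minimal sketch (assuming the standard openai/CLIP package, with ViT-B/32 used purely for illustration) showing that the text tower's positional embedding is a fixed 77-row parameter, so encode_text expects token tensors padded to 77 no matter what context_length is passed to clip.tokenize:

import torch
import clip

# Assumption: standard openai/CLIP package, ViT-B/32 checkpoint for illustration.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)

# The text tower's positional embedding has a fixed length of 77.
print(model.positional_embedding.shape)  # torch.Size([77, 512])
print(model.context_length)              # 77

# Default tokenization pads/truncates to 77, which matches encode_text.
tokens_77 = clip.tokenize(["a photo of a dog"]).to(device)
print(tokens_77.shape)                   # torch.Size([1, 77])

# Tokenizing with context_length=56 yields [1, 56] tokens, which no longer
# lines up with the 77-long positional embedding added inside encode_text.
tokens_56 = clip.tokenize(["a photo of a dog"], context_length=56, truncate=True).to(device)
print(tokens_56.shape)                   # torch.Size([1, 56])

So the truncation only happens at tokenize time, while encode_text still expects the full 77-token width, which is why the addition at model.py line 346 fails.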
Any ideas on how to resolve this?