
How to restart training from a checkpoint?

Open · snow1929 opened this issue 3 years ago · 2 comments

Hi, I want to train the model on my own dataset, but I have a couple of questions; could you please help me? How do I restart training from a checkpoint? And how do I load a CLIP model checkpoint to calculate cosine similarity?

snow1929 · Jul 27 '22 01:07

model.load_state_dict(torch.load("checkpoint_file_name.pt", map_location=device))

You can set device to "cuda" or "cpu". (Note that map_location is an argument of torch.load, not of load_state_dict.)
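
If you saved a full training checkpoint as a dict (as the training script later in this thread does, with 'epoch', 'model_state_dict', and 'optimizer_state_dict' keys), a minimal sketch for resuming could look like this; the filename and key names here are assumptions based on that script:

import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)  # jit=False for training
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Assumes the checkpoint was saved as a dict with these keys,
# as in the training script quoted below
checkpoint = torch.load("model_checkpoint/model_clip_test.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1  # resume the loop from the next epoch

Then run your training loop starting from start_epoch instead of 0.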

For more info on loading checkpoints, see the PyTorch documentation: Load checkpoints
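
For the cosine-similarity part of the question, after restoring your fine-tuned weights as above, a minimal sketch using CLIP's encode_image and encode_text could look like this (the image file and captions here are hypothetical):

from PIL import Image

model.eval()
image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)  # hypothetical image
texts = clip.tokenize(["a photo of a cat", "a photo of a dog"]).to(device)  # hypothetical captions

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(texts)

# Normalize, so the dot product equals the cosine similarity
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
cosine_similarity = image_features @ text_features.T  # shape: (1, num_texts)
print(cosine_similarity)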

sarveshwar-s · Jul 27 '22 07:07

Hi @sarveshwar-s @vinson2233, I am trying to train the CLIP model on my own dataset, using the code from [CLIP Training Code](https://github.com/openai/CLIP/issues/83):

import os

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from PIL import Image

import clip

BATCH_SIZE = 100
EPOCH = 5000 

print("Torch version:", torch.__version__)
print(torch.cuda.is_available())
print(clip.available_models())
# If using GPU then use mixed precision training.
device = "cuda:0" if torch.cuda.is_available() else "cpu" 

# Must set jit=False for training
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

# To resume from a checkpoint saved by this script, uncomment:
# checkpoint = torch.load("model_checkpoint/model_clip_test.pt", map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])

# Define the dataset
class image_title_dataset(Dataset):
    def __init__(self, list_image_path, list_txt):
        self.image_path = list_image_path
        # Tokenize everything at once here (slow at startup), or tokenize in the training loop instead
        self.title = clip.tokenize(list_txt)

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        image = preprocess(Image.open(self.image_path[idx]))  # image from the PIL module
        title = self.title[idx]
        return image, title


# Build (image path, caption) pairs from my dataset:
# each subdirectory name is used as the caption for the images inside it
list_image_path = []
list_txt = []
root = "./image/"
for subdir in os.listdir(root):
    subdir_path = os.path.join(root, subdir)
    for file_name in os.listdir(subdir_path):
        list_image_path.append(os.path.join(subdir_path, file_name))
        list_txt.append(subdir)

print("len(list_image_path) =", len(list_image_path))
print("len(list_txt) =", len(list_txt), "; first caption:", list_txt[0])

dataset = image_title_dataset(list_image_path,list_txt)
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)  # define your own dataloader; shuffle so each batch mixes captions

# https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad is not None:
            p.grad.data = p.grad.data.float()

if device == "cpu":
    model.float()
else:
    clip.model.convert_weights(model)  # actually unnecessary, since CLIP is already in float16 by default

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.2)  # params from the paper; the smaller lr is safer for fine-tuning on a new dataset

# add your own code to track the training progress.


for epoch in range(EPOCH):
    for batch in train_dataloader:
        optimizer.zero_grad()

        images, texts = batch
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)

        # The i-th image matches the i-th text, so the targets are just the batch indices
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)

        # Symmetric cross-entropy over the image and text axes of the similarity matrix
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2
        total_loss.backward()

        if device == "cpu":
            optimizer.step()
        else:
            # Step in fp32, then convert back to fp16 for the next forward pass
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

    if epoch % 100 == 0:
        print("[", epoch, "]\t total_loss =", total_loss.item())

torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': total_loss,
}, "model_checkpoint/model_clip_test.pt")  # just change to your preferred folder/filename

When I train with this code, total_loss is always the same value, but I have no idea what's wrong with it. (Screenshot of the [epoch] / total_loss output omitted.)

snow1929 · Aug 01 '22 06:08