colpali icon indicating copy to clipboard operation
colpali copied to clipboard

[bug?] No such file or directory: 'checkpoints/checkpoint-1/pytorch_model.bin'

Open groklab opened this issue 8 months ago • 1 comments

Hi ColPali team - thanks a lot for your work & code. When I fine-tune the ColQwen2 model with `load_best_model_at_end=True`, training fails with `FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'`.

(What really frustrates me is that my code did work a few weeks ago with older versions of colpali, transformers, etc., but I can no longer find that venv.)

===

My current code is:

# Reproduction script: fine-tunes a 4-bit quantized ColQwen2 model with
# ContrastiveTrainer, evaluating and checkpointing on every step, and asks the
# trainer to reload the best checkpoint when training ends.
import os

# Pin the process to a single GPU. This is set before `torch` is imported below.
GPU_IDX = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_IDX

from pathlib import Path
import torch
from colpali_engine.collators.visual_retriever_collator import VisualRetrieverCollator
from colpali_engine.loss import ColbertPairwiseCELoss

from colpali_engine.models import ColQwen2, ColQwen2Processor

from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer
# NOTE(review): `tear_down_torch`, `load_dataset` and `nn` are imported but not
# used anywhere in this script.
from colpali_engine.utils.torch_utils import get_torch_device, tear_down_torch
from datasets import load_dataset
from torch import nn
from transformers import BitsAndBytesConfig, TrainingArguments

# Directory intended for the final model; it is created here but never
# referenced again in this script.
best_model_dir = Path(f"colqwen2-ft-gpu{GPU_IDX}")
best_model_dir.mkdir(exist_ok=True, parents=True)

device = get_torch_device("auto")

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )


model_name = "vidore/colqwen2-v1.0"

# Load the pretrained checkpoint with the quantization config above.
model = ColQwen2.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )


# Enable gradients on every parameter whose name contains "lora"; all other
# parameters keep whatever `requires_grad` value they were loaded with.
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True
        


processor = ColQwen2Processor.from_pretrained(model_name)
collator = VisualRetrieverCollator(processor=processor)

from datasets import load_from_disk

# 80/20 train/test split of a local dataset (seeded), then keep only the first
# 5 test examples for evaluation and shuffle the training split (seeded).
ds = load_from_disk('../colpali_fine_tuning_dataset').train_test_split(test_size=0.2, seed=42)
ds["test"] = ds["test"].select(range(5))
ds["train"] = ds["train"].shuffle(seed=42)

# Checkpoint output directory; the reported missing file is looked up under
# this directory ('checkpoints-gpu0/checkpoint-1/pytorch_model.bin').
checkpoints_dir = Path(f"checkpoints-gpu{GPU_IDX}")
checkpoints_dir.mkdir(exist_ok=True, parents=True)

# Evaluate, save and log every single step, and ask the Trainer to reload the
# checkpoint with the lowest `eval_loss` once training finishes.
# NOTE(review): `save_strategy` is left at its default here; presumably it
# resolves to step-based saving so that it lines up with `eval_strategy` --
# confirm against the installed transformers version.
training_args = TrainingArguments(
    output_dir=str(checkpoints_dir),
    overwrite_output_dir=True,
    num_train_epochs=0.01,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    eval_strategy="steps",
    save_steps=1,
    logging_steps=1,
    eval_steps=1,
    warmup_steps=1,
    learning_rate=5e-5,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


trainer = ContrastiveTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    args=training_args,
    data_collator=collator,
    loss_func=ColbertPairwiseCELoss(),
    is_vision_model=True,
)

# Set on the trainer's args after construction rather than passed to
# TrainingArguments above.
trainer.args.remove_unused_columns = False
# This is the call that raises the reported FileNotFoundError.
train_results = trainer.train()

And it throws the following error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 96
     85 trainer = ContrastiveTrainer(
     86     model=model,
     87     train_dataset=ds["train"],
   (...)     92     is_vision_model=True,
     93 )
     95 trainer.args.remove_unused_columns = False
---> 96 train_results = trainer.train()

File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2164, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   2162         hf_hub_utils.enable_progress_bars()
   2163 else:
-> 2164     return inner_training_loop(
   2165         args=args,
   2166         resume_from_checkpoint=resume_from_checkpoint,
   2167         trial=trial,
   2168         ignore_keys_for_eval=ignore_keys_for_eval,
   2169     )

File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2646, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2643     elif is_sagemaker_mp_enabled():
   2644         smp.barrier()
-> 2646     self._load_best_model()
   2648 # add remaining tr_loss
   2649 self._total_loss_scalar += tr_loss.item()

File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2963, in Trainer._load_best_model(self)
   2961     state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
   2962 else:
-> 2963     state_dict = torch.load(
   2964         best_model_path,
   2965         map_location="cpu",
   2966         **weights_only_kwarg,
   2967     )
   2969 # If the model is on the GPU, it still works!
   2970 # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
   2971 # which takes *args instead of **kwargs
   2972 load_result = model.load_state_dict(state_dict, False)

File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:1425, in load(f, map_location, pickle_module, weights_only, mmap, **pickle_load_args)
   1422 if "encoding" not in pickle_load_args.keys():
   1423     pickle_load_args["encoding"] = "utf-8"
-> 1425 with _open_file_like(f, "rb") as opened_file:
   1426     if _is_zipfile(opened_file):
   1427         # The zipfile reader is going to advance the current file position.
   1428         # If we want to actually tail call to torch.jit.load, we need to
   1429         # reset back to the original position.
   1430         orig_position = opened_file.tell()

File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:751, in _open_file_like(name_or_buffer, mode)
    749 def _open_file_like(name_or_buffer, mode):
    750     if _is_path(name_or_buffer):
--> 751         return _open_file(name_or_buffer, mode)
    752     else:
    753         if "w" in mode:

File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:732, in _open_file.__init__(self, name, mode)
    731 def __init__(self, name, mode):
--> 732     super().__init__(open(name, mode))

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'

Can you advise what's going wrong, and whether I am missing something?

Thank you.

groklab avatar Apr 23 '25 22:04 groklab

Hmm - are you saving checkpoints there?

ManuelFay avatar May 26 '25 16:05 ManuelFay

Not sure what's happening. I'll close this if you don't answer, since I can't reproduce it.

ManuelFay avatar Jun 05 '25 14:06 ManuelFay