[bug?] No such file or directory: 'checkpoints/checkpoint-1/pytorch_model.bin'
Hi ColPali team - thanks a lot for your work & code. When I fine-tune the ColQwen model with `load_best_model_at_end=True`, training fails with `FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'`.
(What really frustrates me is that my code did work a few weeks ago on older {colpali/transformer/etc.} versions, but I can no longer find that venv.)
===
My current code is:
import os

# Select the GPU and export it before torch is imported further down.
GPU_IDX = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_IDX

from pathlib import Path
import torch
from colpali_engine.collators.visual_retriever_collator import VisualRetrieverCollator
from colpali_engine.loss import ColbertPairwiseCELoss
from colpali_engine.models import ColQwen2, ColQwen2Processor
from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer
from colpali_engine.utils.torch_utils import get_torch_device, tear_down_torch
from datasets import load_dataset
from torch import nn
from transformers import BitsAndBytesConfig, TrainingArguments

# Directory reserved for the final fine-tuned model (one per GPU index).
best_model_dir = Path(f"colqwen2-ft-gpu{GPU_IDX}")
best_model_dir.mkdir(parents=True, exist_ok=True)

device = get_torch_device("auto")

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_name = "vidore/colqwen2-v1.0"
model = ColQwen2.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Enable gradients on every parameter whose name contains "lora".
lora_params = (p for n, p in model.named_parameters() if "lora" in n)
for p in lora_params:
    p.requires_grad = True

processor = ColQwen2Processor.from_pretrained(model_name)
collator = VisualRetrieverCollator(processor=processor)

from datasets import load_from_disk

# 80/20 split of the on-disk dataset; evaluation is capped at the first
# 5 test rows and the training split is shuffled with a fixed seed.
full_dataset = load_from_disk('../colpali_fine_tuning_dataset')
ds = full_dataset.train_test_split(test_size=0.2, seed=42)
ds["test"] = ds["test"].select(range(5))
ds["train"] = ds["train"].shuffle(seed=42)

checkpoints_dir = Path(f"checkpoints-gpu{GPU_IDX}")
checkpoints_dir.mkdir(parents=True, exist_ok=True)

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir=str(checkpoints_dir),
    overwrite_output_dir=True,
    save_total_limit=1,
    # Very short run: a fraction of an epoch.
    num_train_epochs=0.01,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    # Log, evaluate and save on every single step.
    eval_strategy="steps",
    eval_steps=1,
    save_steps=1,
    logging_steps=1,
    # Optimizer schedule.
    warmup_steps=1,
    learning_rate=5e-5,
    # Track the checkpoint with the lowest eval_loss and reload it when
    # training finishes. This reload is the step that raises the
    # FileNotFoundError reported in this issue.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = ContrastiveTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    args=training_args,
    data_collator=collator,
    loss_func=ColbertPairwiseCELoss(),
    is_vision_model=True,
)

# Switched off after construction rather than via TrainingArguments.
# NOTE(review): presumably so the collator still sees the raw dataset columns - confirm.
trainer.args.remove_unused_columns = False

train_results = trainer.train()
Running it throws the following error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[1], line 96
85 trainer = ContrastiveTrainer(
86 model=model,
87 train_dataset=ds["train"],
(...) 92 is_vision_model=True,
93 )
95 trainer.args.remove_unused_columns = False
---> 96 train_results = trainer.train()
File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2164, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2162 hf_hub_utils.enable_progress_bars()
2163 else:
-> 2164 return inner_training_loop(
2165 args=args,
2166 resume_from_checkpoint=resume_from_checkpoint,
2167 trial=trial,
2168 ignore_keys_for_eval=ignore_keys_for_eval,
2169 )
File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2646, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2643 elif is_sagemaker_mp_enabled():
2644 smp.barrier()
-> 2646 self._load_best_model()
2648 # add remaining tr_loss
2649 self._total_loss_scalar += tr_loss.item()
File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2963, in Trainer._load_best_model(self)
2961 state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
2962 else:
-> 2963 state_dict = torch.load(
2964 best_model_path,
2965 map_location="cpu",
2966 **weights_only_kwarg,
2967 )
2969 # If the model is on the GPU, it still works!
2970 # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
2971 # which takes *args instead of **kwargs
2972 load_result = model.load_state_dict(state_dict, False)
File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:1425, in load(f, map_location, pickle_module, weights_only, mmap, **pickle_load_args)
1422 if "encoding" not in pickle_load_args.keys():
1423 pickle_load_args["encoding"] = "utf-8"
-> 1425 with _open_file_like(f, "rb") as opened_file:
1426 if _is_zipfile(opened_file):
1427 # The zipfile reader is going to advance the current file position.
1428 # If we want to actually tail call to torch.jit.load, we need to
1429 # reset back to the original position.
1430 orig_position = opened_file.tell()
File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:751, in _open_file_like(name_or_buffer, mode)
749 def _open_file_like(name_or_buffer, mode):
750 if _is_path(name_or_buffer):
--> 751 return _open_file(name_or_buffer, mode)
752 else:
753 if "w" in mode:
File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:732, in _open_file.__init__(self, name, mode)
731 def __init__(self, name, mode):
--> 732 super().__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'
Can you advise what's going wrong? Am I missing something?
Thank you.
hmmm - are you saving checkpoints there ?
Not sure what's happening. I can't reproduce this, so I'll close the issue if you don't answer.