Issues with Model Performance on Smaller Roboflow Datasets
Hey @Peterande! First of all, thank you so much for this interesting contribution.
We at Roboflow are trying to benchmark D-FINE's performance on smaller Roboflow datasets, like those in Roboflow 100, and we are seeing sub-5% mAP, as well as crashes, on every dataset we try, both when using the O365 weights and without.
We're using the finetuning config you have defined here and simply replacing the dataset link with a Roboflow dataset. We want to make sure we represent your work in the best light, so do you have any advice on how to get better metrics here? We noticed that you recommended trying training from scratch to the other people encountering similar issues, but we didn't see anyone confirm that it worked for them, and we haven't seen it help much for us either.
Here's how we've been launching the benchmark. Please let us know if we're doing something obviously wrong!
import roboflow
import os
import json
import subprocess
import torch
import fire
import re

model_name_to_config_map = {
    "dfine_s": "configs/dfine/custom/objects365/dfine_hgnetv2_s_obj2custom.yml",
    "dfine_m": "configs/dfine/custom/objects365/dfine_hgnetv2_m_obj2custom.yml",
    "dfine_l": "configs/dfine/custom/objects365/dfine_hgnetv2_l_obj2custom.yml",
    "dfine_x": "configs/dfine/custom/objects365/dfine_hgnetv2_x_obj2custom.yml",
}

dataset_config_template = "configs/dataset/custom_detection.yml"
generated_model_config_base_dir = "configs/dfine/custom/roboflow"
generated_dataset_config_base_dir = "configs/dataset/roboflow"

model_name_to_o365_checkpoint_map = {
    "dfine_s": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_s_obj365.pth",
    "dfine_m": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_m_obj365.pth",
    "dfine_l": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_l_obj365.pth",
    "dfine_x": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_x_obj365.pth",
}


def train_on_roboflow_url(roboflow_url, model_name="dfine_s", output_dir="./output"):
    # load dataset and related info
    print(f"Downloading dataset from {roboflow_url}")
    dataset = roboflow.download_dataset(roboflow_url, "coco")
    dataset_train_image_folder = os.path.join(dataset.location, "train")
    dataset_train_annotation_file = os.path.join(dataset_train_image_folder, "_annotations.coco.json")
    dataset_val_image_folder = os.path.join(dataset.location, "valid")
    with open(dataset_train_annotation_file, "r") as f:
        train_annotations = json.load(f)
    num_classes = len(train_annotations["categories"])
    del train_annotations

    # construct dataset config
    print(f"Creating dataset config in {output_dir}")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(generated_dataset_config_base_dir, exist_ok=True)
    os.makedirs(generated_model_config_base_dir, exist_ok=True)
    with open(dataset_config_template, "r") as f:
        dataset_config = f.read()
    dataset_config = dataset_config.replace("num_classes: 777 # your dataset classes", f"num_classes: {num_classes}")
    dataset_config = dataset_config.replace("/data/yourdataset/train", dataset_train_image_folder)
    dataset_config = dataset_config.replace("train.json", "_annotations.coco.json")
    dataset_config = dataset_config.replace("/data/yourdataset/val", dataset_val_image_folder)
    dataset_config = dataset_config.replace("val.json", "_annotations.coco.json")
    dataset_filename = f"{dataset.name}_dfine_dataset_config.yml"
    dataset_config_save_name = os.path.join(generated_dataset_config_base_dir, dataset_filename)
    with open(dataset_config_save_name, "w") as f:
        f.write(dataset_config)

    # construct model config
    print(f"Creating model config in {output_dir}")
    model_config = model_name_to_config_map[model_name]
    with open(model_config, "r") as f:
        model_config = f.read()
    # ensure the standardized bs and epochs
    # model_config = model_config.replace("epochs: 64", "epochs: 1")
    epochs = re.search(r"epochs: (\d+)", model_config).group(1)
    model_config = model_config.replace(f"epochs: {epochs}", "epochs: 100")
    # model_config = model_config.replace("total_batch_size: 128", "total_batch_size: 16")
    # model_config = model_config.replace("total_batch_size: 256", "total_batch_size: 16")
    # two-space indent so the inserted key nests consistently under train_dataloader
    model_config = model_config.replace("train_dataloader:", "train_dataloader:\n  total_batch_size: 16")
    epoch = re.search(r"epoch: (\d+)", model_config).group(1)
    model_config = model_config.replace(f"epoch: {epoch}", "epoch: 90")
    stop_epoch = re.search(r"stop_epoch: (\d+)", model_config).group(1)
    model_config = model_config.replace(f"stop_epoch: {stop_epoch}", "stop_epoch: 90")
    model_config = model_config.replace("dataset/custom_detection.yml", os.path.join("..", dataset_config_save_name))
    # model_config = model_config.replace("\'../", "\'../../") # since the model config is in an extra subdir
    train_output_dir = os.path.join(output_dir, f"{dataset.name}_{model_name}_train_output")
    model_size = model_name.split("_")[1]
    model_config = model_config.replace(f"output_dir: ./output/dfine_hgnetv2_{model_size}_obj2custom", f"output_dir: {train_output_dir}")
    model_config_save_name = os.path.join(generated_model_config_base_dir, f"{dataset.name}_{model_name}_model_config.yml")
    with open(model_config_save_name, "w") as f:
        f.write(model_config)

    # train model
    o365_checkpoint_url = model_name_to_o365_checkpoint_map[model_name]
    o365_checkpoint_name = o365_checkpoint_url.split("/")[-1]
    o365_checkpoint_path = os.path.join(output_dir, o365_checkpoint_name)
    if not os.path.exists(o365_checkpoint_path):
        print(f"Downloading O365 checkpoint from {o365_checkpoint_url}")
        subprocess.run(["wget", o365_checkpoint_url, "-O", o365_checkpoint_path])
    print(f"Training model in {train_output_dir}")
    num_gpus = torch.cuda.device_count()
    train_result = subprocess.run([
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--rdzv_endpoint", "localhost:0",
        "--rdzv-backend", "c10d",
        "train.py",
        "-c", model_config_save_name,
        "--use-amp",
        "--seed=0",
        "-t", o365_checkpoint_path
    ])

    # get test set performance
    dataset_config = dataset_config.replace("valid", "test")
    with open(dataset_config_save_name, "w") as f:
        f.write(dataset_config)
    stg1_checkpoint_path = os.path.join(train_output_dir, "best_stg1.pth")
    stg2_checkpoint_path = os.path.join(train_output_dir, "best_stg2.pth")
    if os.path.exists(stg2_checkpoint_path):
        print(f"Testing with STG2 checkpoint {stg2_checkpoint_path}")
        checkpoint_path = stg2_checkpoint_path
    elif os.path.exists(stg1_checkpoint_path):
        print(f"Testing with STG1 checkpoint {stg1_checkpoint_path}")
        checkpoint_path = stg1_checkpoint_path
    else:
        raise ValueError(f"No checkpoint found in {train_output_dir}")
    test_result = subprocess.run([
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--rdzv_endpoint", "localhost:0",
        "--rdzv-backend", "c10d",
        "train.py",
        "-c", model_config_save_name,
        "--test-only",
        "-r", checkpoint_path
    ])
    test_stats_pth = os.path.join(train_output_dir, "test_stats.pth")
    test_stats = torch.load(test_stats_pth, weights_only=False)
    results_json = {
        "model_name": model_name,
        "map": test_stats["coco_eval_bbox"][0],
        "map50": test_stats["coco_eval_bbox"][1],
        "url": roboflow_url,
    }
    results_json_pth = os.path.join(train_output_dir, "results.json")
    with open(results_json_pth, "w") as f:
        json.dump(results_json, f, indent=2)


if __name__ == "__main__":
    fire.Fire(train_on_roboflow_url)
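For completeness, this is how we invoke the script above; the filename benchmark_dfine.py is just what we happen to call it locally, and the dataset URL is a placeholder:

# one run per dataset/model pair
python benchmark_dfine.py "<ROBOFLOW_DATASET_URL>" --model_name dfine_s --output_dir ./output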
Thank you for your attention. Could you please provide the training logs from a run with a low AP result?
Hi! I'll follow up with logs by the end of the week. Apologies for the delay, taking some time off to get some rest after the sprint to ship RF-DETR :)
Following up with the logs! Please find them attached. On our test dataset we converged to <2 mAP.
Additionally, here are the YAMLs we used (the generated dataset config, followed by the generated model config); everything else in the repo was untouched.
task: detection

evaluator:
  type: CocoEvaluator
  iou_types: ['bbox', ]

num_classes: 8
remap_mscoco_category: False

train_dataloader:
  type: DataLoader
  dataset:
    type: CocoDetection
    img_folder: /home/isaac/D-FINE/aquarium-combined-gjvb-1/train
    ann_file: /home/isaac/D-FINE/aquarium-combined-gjvb-1/train/_annotations.coco.json
    return_masks: False
    transforms:
      type: Compose
      ops: ~
  shuffle: True
  num_workers: 4
  drop_last: True
  collate_fn:
    type: BatchImageCollateFunction

val_dataloader:
  type: DataLoader
  dataset:
    type: CocoDetection
    img_folder: /home/isaac/D-FINE/aquarium-combined-gjvb-1/valid
    ann_file: /home/isaac/D-FINE/aquarium-combined-gjvb-1/valid/_annotations.coco.json
    return_masks: False
    transforms:
      type: Compose
      ops: ~
  shuffle: False
  num_workers: 4
  drop_last: False
  collate_fn:
    type: BatchImageCollateFunction
__include__: [
  '../../../../configs/dataset/roboflow/aquarium-combined-gjvb_dfine_dataset_config.yml',
  '../../../runtime.yml',
  '../../include/dataloader.yml',
  '../../include/optimizer.yml',
  '../../include/dfine_hgnetv2.yml',
]

output_dir: ./output/aquarium-combined-gjvb_dfine_s_train_output

DFINE:
  backbone: HGNetv2

HGNetv2:
  name: 'B0'
  return_idx: [1, 2, 3]
  freeze_at: -1
  freeze_norm: False
  use_lab: True
  pretrained: False

DFINETransformer:
  num_layers: 3  # 4 5 6
  eval_idx: -1  # -2 -3 -4

HybridEncoder:
  in_channels: [256, 512, 1024]
  hidden_dim: 256
  depth_mult: 0.34
  expansion: 0.5

optimizer:
  type: AdamW
  params:
    -
      params: '^(?=.*backbone)(?!.*norm|bn).*$'
      lr: 0.000125
    -
      params: '^(?=.*backbone)(?=.*norm|bn).*$'
      lr: 0.000125
      weight_decay: 0.
    -
      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
      weight_decay: 0.
  lr: 0.00025
  betas: [0.9, 0.999]
  weight_decay: 0.000125

epochs: 64 # Early stop

train_dataloader:
  dataset:
    transforms:
      policy:
        epoch: 56
  collate_fn:
    stop_epoch: 56
    ema_restart_decay: 0.9999
    base_size_repeat: 10

ema:
  warmups: 0

lr_warmup_scheduler:
  warmup_duration: 0
We used a dataset hosted on Roboflow of a visit to an aquarium, downloaded via:
roboflow.download_dataset(<INSERT URL HERE>, "coco")
which downloads it in COCO format.
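In case it helps with debugging, here is a minimal sanity-check sketch we can run over the downloaded splits; the dataset root matches the paths in the YAMLs above, and the snippet simply reports whatever the COCO annotation files contain:

import json
import os

# Path taken from the YAMLs above; adjust if the download lands elsewhere.
dataset_root = "/home/isaac/D-FINE/aquarium-combined-gjvb-1"

for split in ["train", "valid", "test"]:
    ann_file = os.path.join(dataset_root, split, "_annotations.coco.json")
    if not os.path.exists(ann_file):
        print(f"{split}: no _annotations.coco.json found")
        continue
    with open(ann_file) as f:
        coco = json.load(f)
    # Report basic stats so empty splits or category mismatches are easy to spot.
    print(
        f"{split}: {len(coco['images'])} images, "
        f"{len(coco['annotations'])} annotations, "
        f"{len(coco['categories'])} categories: "
        f"{[c['name'] for c in coco['categories']]}"
    )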
The specific finetune command we ran was:
torchrun --nproc_per_node 1 --rdzv_endpoint localhost:0 --rdzv-backend c10d train.py -c <MODEL_CONFIG_YAML> --use-amp --seed=0 -t <OBJECTS365_CHECKPOINT>
Any update on this? We would really love to make sure we're doing your method justice.
The Objects365 pre-trained model, when fine-tuned on very small amounts of data, can indeed suffer significant performance degradation or, in the worst case, fail to converge. This is because the feature representation space learned on Objects365 is largely devoted to distinguishing a complex set of object categories rather than to detection alone. When fine-tuning on your dataset, both the number of categories and the total number of samples are very small, which makes the model highly prone to overfitting. Could you please share the specific results when using the pre-trained weights versus not using them? Also, have you tried fine-tuning from the stage 1 weights pretrained on COCO?
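For concreteness, this is the kind of comparison I have in mind, reusing the launch command already posted above; the checkpoint placeholders are just stand-ins for whichever weights you download, and leaving out -t is one way to run without loading any detector checkpoint:

# fine-tune from the Objects365 checkpoint (what the benchmark script above already does)
torchrun --nproc_per_node 1 --rdzv_endpoint localhost:0 --rdzv-backend c10d train.py -c <MODEL_CONFIG_YAML> --use-amp --seed=0 -t <OBJECTS365_CHECKPOINT>

# same command with -t omitted, i.e. without the Objects365 weights
torchrun --nproc_per_node 1 --rdzv_endpoint localhost:0 --rdzv-backend c10d train.py -c <MODEL_CONFIG_YAML> --use-amp --seed=0

# fine-tune from a COCO-pretrained checkpoint instead of the Objects365 one
torchrun --nproc_per_node 1 --rdzv_endpoint localhost:0 --rdzv-backend c10d train.py -c <MODEL_CONFIG_YAML> --use-amp --seed=0 -t <COCO_CHECKPOINT>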
As noted in the original post, we tried both from scratch and from o365. I'll follow up with specific logs.