super-gradients icon indicating copy to clipboard operation
super-gradients copied to clipboard

Using DetectionMetricsDistanceBased as a metric throws error - KeyError: '[email protected]'

Open icaroryan opened this issue 11 months ago • 5 comments

🐛 Describe the bug

I wanted to add DetectionMetricsDistanceBased as one of the metrics when training, but training then fails with KeyError: '[email protected]'.

Here's the relevant part of my code; I wonder if I messed something up. I couldn't find any other examples of this, so I went with my gut. Let me know if more parts of my code are needed.

I think it's worth saying that the end goal here is to improve small object detection.

Training Parameters

from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import (
    DetectionMetrics_050,
    DetectionMetrics_050_095,
)

from super_gradients.training.metrics.detection_metrics import (
    DetectionMetricsDistanceBased
)

from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback

early_stop = EarlyStop(
    Phase.VALIDATION_EPOCH_END,
    monitor = "mAP@0.50",
    mode = "max",
    min_delta = 0.001,
    patience = 20,
    verbose = True,
)

train_params = {
    'silent_mode': False,
    "average_best_models":True,
    "warmup_mode": "linear_epoch_step",
    "warmup_initial_lr": 1e-6,
    "lr_warmup_epochs": 4,
    "initial_lr": 2e-4,
    "lr_mode": "cosine",
    "cosine_final_lr_ratio": 0.1,
    "optimizer": "AdamW",
    "optimizer_params": {"weight_decay": 0.0001},
    "zero_weight_decay_on_bias_and_bn": True,
    "ema": True,
    "batch_accumulate": 1,
    "ema_params": {"decay": 0.9, "decay_type": "threshold"},
    "max_epochs": EPOCHS,
    "mixed_precision": True,
    "loss": PPYoloELoss(
        use_static_assigner=False,
        num_classes=len(dataset_params['classes'])
    ),
    "valid_metrics_list": [
        DetectionMetrics_050(
            score_thres=0.1,
            top_k_predictions=30,
            num_cls=len(dataset_params['classes']),
            normalize_targets=True,
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=30,
                nms_threshold=0.7
            )
        ),
        DetectionMetrics_050_095(
            score_thres=0.1,
            top_k_predictions=30,
            num_cls=len(dataset_params['classes']),
            normalize_targets=True,
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=30,
                nms_threshold=0.7
            )
        ),
        DetectionMetricsDistanceBased(
            num_cls=len(dataset_params['classes']),
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=30,
                nms_threshold=0.7
            ),
        )
    ],
    "metric_to_watch": 'mAP@0.50',
    "phase_callbacks": [early_stop],
    "resume": False
}

# ...

trainer.train(
        model=model, 
        training_params=train_params, 
        train_loader=train_data, 
        valid_loader=val_data
    )

Error

Train epoch 0: 100%|██████████| 4939/4939 [34:51<00:00,  2.36it/s, PPYoloELoss/loss=1.43, PPYoloELoss/loss_cls=0.818, PPYoloELoss/loss_dfl=0.318, PPYoloELoss/loss_iou=0.295, gpu_mem=41.2]
Validating: 100%|██████████| 431/431 [01:23<00:00,  5.14it/s]
[2024-03-11 19:43:24] INFO - early_stopping.py - Metric mAP@0.50 improved. New best score: 0.248
[2024-03-11 19:43:24] INFO - base_sg_logger.py - [CLEANUP] - Successfully stopped system monitoring process
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[59], line 16
      2 trainer = Trainer(
      3     experiment_name=model_to_train+'_ball_aug_'+str(EPOCHS)+'e', 
      4     ckpt_root_dir=CHECKPOINT_DIR
      5 )
      7 model = models.get(
      8     model_to_train, 
      9     num_classes=len(dataset_params['classes']), 
   (...)
     12     # arch_params=arch_params
     13 )
---> 16 trainer.train(
     17     model=model, 
     18     training_params=train_params, 
     19     train_loader=train_data, 
     20     valid_loader=val_data
     21 )

File ~/.local/lib/python3.9/site-packages/super_gradients/training/sg_trainer/sg_trainer.py:1604, in Trainer.train(self, model, training_params, train_loader, valid_loader, test_loaders, additional_configs_to_log)
   1602     # SAVING AND LOGGING OCCURS ONLY IN THE MAIN PROCESS (IN CASES THERE ARE SEVERAL PROCESSES - DDP)
   1603     if should_run_validation and self.training_params.save_model:
-> 1604         self._save_checkpoint(
   1605             optimizer=self.optimizer,
   1606             epoch=1 + epoch,
   1607             train_metrics_dict=train_metrics_dict,
   1608             validation_results_dict=valid_metrics_dict,
   1609             context=context,
   1610         )
   1611     self.sg_logger.upload()
   1613 if not silent_mode:

File ~/.local/lib/python3.9/site-packages/super_gradients/training/sg_trainer/sg_trainer.py:670, in Trainer._save_checkpoint(self, optimizer, epoch, train_metrics_dict, validation_results_dict, context)
    665 # create metrics dict to save
    666 valid_metrics_titles = get_metrics_titles(self.valid_metrics)
    668 all_metrics = {
    669     "tracked_metric_name": self.metric_to_watch,
--> 670     "valid": {metric_name: float(validation_results_dict[metric_name]) for metric_name in valid_metrics_titles},
    671 }
    673 if train_metrics_dict is not None:
    674     train_metrics_titles = get_metrics_titles(self.train_metrics)

File ~/.local/lib/python3.9/site-packages/super_gradients/training/sg_trainer/sg_trainer.py:670, in <dictcomp>(.0)
    665 # create metrics dict to save
    666 valid_metrics_titles = get_metrics_titles(self.valid_metrics)
    668 all_metrics = {
    669     "tracked_metric_name": self.metric_to_watch,
--> 670     "valid": {metric_name: float(validation_results_dict[metric_name]) for metric_name in valid_metrics_titles},
    671 }
    673 if train_metrics_dict is not None:
    674     train_metrics_titles = get_metrics_titles(self.train_metrics)

KeyError: '[email protected]'

Versions

Super_gradients 3.6.0

icaroryan avatar Mar 12 '24 00:03 icaroryan