SoccerNetv2-DevKit

Training and validation loss sometimes become NaN during training.

Open skfaysal opened this issue 2 years ago • 6 comments

[Screenshot attached: 2023-02-13, 2:36 AM]

skfaysal · Feb 12 '23, 20:02

More info?

SilvioGiancola · Feb 13 '23, 06:02

The training script runs and calculates the loss correctly, but sometimes when the same script is run without any changes, the loss becomes "NaN."

skfaysal · Feb 13 '23, 11:02

Hi @skfaysal,

We would need a bit more information, such as which baseline (CALF, NetVLAD++, etc.) you are trying to run and which command line you are using. Thank you.

cioppaanthony · Feb 13 '23, 15:02

Hi @cioppaanthony, I'm running the NetVLAD++ baseline and loading its pre-trained action spotting weights. The command I'm using to run the script is:

python3 src/main.py --SoccerNet_path=/root/training_data/ --load_weights=NetVLAD++_run_4 --model_name=NetVLAD++_run_ --batch_size 256

skfaysal · Feb 13 '23, 15:02

Here is the main script. I have fine-tuned the model by changing the output layer for my reduced dataset, which has 4 output classes.

import os
import logging
from datetime import datetime
import time
import numpy as np
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

import torch
import torch.nn as nn

import importlib


from dataset import SoccerNetClips, SoccerNetClipsTesting #,SoccerNetClipsOld
from model import Model
from train import trainer, test, testSpotting
from loss import NLLLoss

from inspect import currentframe, getframeinfo



def main(args):
    # mlflow.pytorch.autolog()
    logging.info("Parameters:")
    for arg in vars(args):
        logging.info(arg.rjust(15) + " : " + str(getattr(args, arg)))

    # create dataset
    if not args.test_only:
        print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
        dataset_Train = SoccerNetClips(path=args.SoccerNet_path, features=args.features, split=args.split_train, version=args.version, framerate=args.framerate, window_size=args.window_size)
        print('\n')
        dataset_Valid = SoccerNetClips(path=args.SoccerNet_path, features=args.features, split=args.split_valid, version=args.version, framerate=args.framerate, window_size=args.window_size)
        print('\n')
        dataset_Valid_metric  = SoccerNetClips(path=args.SoccerNet_path, features=args.features, split=args.split_valid, version=args.version, framerate=args.framerate, window_size=args.window_size)
    print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
    dataset_Test  = SoccerNetClipsTesting(path=args.SoccerNet_path, features=args.features, split=args.split_test, version=args.version, framerate=args.framerate, window_size=args.window_size)

    if args.feature_dim is None:
        print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
        args.feature_dim = dataset_Test[0][1].shape[-1]
        # print("feature_dim found:", args.feature_dim)
    # create model
    print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
    # print("\n##### dataset_Test.num_classes: ",dataset_Test.num_classes)
    
    
    if not args.test_only:
        model = Model(input_size=args.feature_dim,
                num_classes= 17, window_size=args.window_size, 
                vocab_size = args.vocab_size,
                framerate=args.framerate, pool=args.pool).cuda() # num_classes=17
    else:
        model = Model(input_size=args.feature_dim,
                    num_classes= dataset_Test.num_classes, window_size=args.window_size, 
                    vocab_size = args.vocab_size,
                    framerate=args.framerate, pool=args.pool).cuda() # num_classes=17
    if not args.test_only:
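        # Freeze every existing parameter; the checkpoint is loaded below, and only
        # the replacement fc layer (requires_grad=True by default) is updated
        # during fine-tuning.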

        for param in model.parameters():
            param.requires_grad = False
    
    if not args.test_only:  
        checkpoint = torch.load(os.path.join("models", args.load_weights, "model.pth.tar"))
        model.load_state_dict(checkpoint['state_dict'])
        print("\n\n=> loaded checkpoint '{}' (epoch {})"
                    .format(args.load_weights, checkpoint['epoch']))
        
        logging.info(model)
        print("\n[info] Model Parameters before changing fc: ",model.parameters)
        # total_params = sum(p.numel()
        #                 for p in model.parameters() if p.requires_grad)
        # parameters_per_layer  = [p.numel() for p in model.parameters() if p.requires_grad]
        # logging.info("Total number of parameters: " + str(total_params))

        print("\n\n######## After change fc layer:")
        num_ftrs = model.fc.in_features
        # print("\n### num_ftrs:",num_ftrs)
        # Here the size of each output sample is set to 5 (4 action classes plus background).
        # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
        model.fc = nn.Linear(num_ftrs, 5).cuda()

        print("\n[info] Model Parameters: ",model.parameters)
        # total_params = sum(p.numel()
        #                 for p in model.parameters() if p.requires_grad)
        # parameters_per_layer  = [p.numel() for p in model.parameters() if p.requires_grad]
        # logging.info("Total number of parameters: " + str(total_params))



    # create dataloader
    if not args.test_only:
        print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
        train_loader = torch.utils.data.DataLoader(dataset_Train,
            batch_size=args.batch_size, shuffle=True,
            num_workers=args.max_num_worker, pin_memory=True)

        val_loader = torch.utils.data.DataLoader(dataset_Valid,
            batch_size=args.batch_size, shuffle=False,
            num_workers=args.max_num_worker, pin_memory=True)

        val_metric_loader = torch.utils.data.DataLoader(dataset_Valid_metric,
            batch_size=args.batch_size, shuffle=False,
            num_workers=args.max_num_worker, pin_memory=True)

        # Display image and label.
        train_features, train_labels = next(iter(train_loader))
        print(f"\n\nFeature batch shape: {train_features.size()}")
        print(f"\n\nLabels batch shape: {train_labels.size()}")
        # img = train_features[0].squeeze()
        # label = train_labels[0]
        # print(f"Label: {label}")


    # training parameters
    if not args.test_only:
        print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
        criterion = NLLLoss()
        print("\n\n Parameters to update: ")
        for name,param in model.named_parameters():
            if param.requires_grad == True:
                print("\t",name)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.LR, 
                                    betas=(0.9, 0.999), eps=1e-08, 
                                    weight_decay=0, amsgrad=False)


        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=args.patience)

        # start training
        print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
        trainer(train_loader, val_loader, val_metric_loader, 
                model, optimizer, scheduler, criterion,
                model_name=args.model_name,
                max_epochs=args.max_epochs, evaluation_frequency=args.evaluation_frequency)



    # Free up some RAM memory while training
    if not args.test_only:
        del dataset_Train, dataset_Valid, dataset_Valid_metric, dataset_Test
        del train_loader, val_loader, val_metric_loader

    # For the best model only
    checkpoint = torch.load(os.path.join("models", args.model_name, "model.pth.tar"))
    model.load_state_dict(checkpoint['state_dict'])


    for split in args.split_test:
        print("\n....[INFO] line {} called".format(getframeinfo(currentframe()).lineno))
        dataset_Test  = SoccerNetClipsTesting(path=args.SoccerNet_path, features=args.features, split=[split], version=args.version, framerate=args.framerate, window_size=args.window_size)

        test_loader = torch.utils.data.DataLoader(dataset_Test,
            batch_size=1, shuffle=False,
            num_workers=1, pin_memory=True)

        results = testSpotting(test_loader, model=model, model_name=args.model_name, NMS_window=args.NMS_window, NMS_threshold=args.NMS_threshold)
        if results is None:
            continue

        a_mAP = results["a_mAP"]
        a_mAP_per_class = results["a_mAP_per_class"]
        a_mAP_visible = results["a_mAP_visible"]
        a_mAP_per_class_visible = results["a_mAP_per_class_visible"]
        a_mAP_unshown = results["a_mAP_unshown"]
        a_mAP_per_class_unshown = results["a_mAP_per_class_unshown"]

        # mlflow.log_metric("a_mAP",a_mAP)
        # mlflow.log_metric("a_mAP_per_class",a_mAP_per_class)

        logging.info("Best Performance at end of training ")
        logging.info("a_mAP visibility all: " +  str(a_mAP))
        logging.info("a_mAP visibility all per class: " +  str( a_mAP_per_class))
        logging.info("a_mAP visibility visible: " +  str( a_mAP_visible))
        logging.info("a_mAP visibility visible per class: " +  str( a_mAP_per_class_visible))
        logging.info("a_mAP visibility unshown: " +  str( a_mAP_unshown))
        logging.info("a_mAP visibility unshown per class: " +  str( a_mAP_per_class_unshown))

    return 
    
if __name__ == '__main__':

    parser = ArgumentParser(description='context aware loss function', formatter_class=ArgumentDefaultsHelpFormatter)
    
    parser.add_argument('--SoccerNet_path',   required=False, type=str,   default="/path/to/SoccerNet/",     help='Path for SoccerNet' )
    parser.add_argument('--features',   required=False, type=str,   default="ResNET_TF2.npy",     help='Video features' )
    parser.add_argument('--max_epochs',   required=False, type=int,   default=1000,     help='Maximum number of epochs' )
    parser.add_argument('--load_weights',   required=False, type=str,   default=None,     help='weights to load' )
    parser.add_argument('--model_name',   required=False, type=str,   default="NetVLAD++",     help='named of the model to save' )
    parser.add_argument('--test_only',   required=False, action='store_true',  help='Perform testing only' )

    parser.add_argument('--split_train', nargs='+', default=["train"], help='list of split for training')
    parser.add_argument('--split_valid', nargs='+', default=["valid"], help='list of split for validation')
    parser.add_argument('--split_test', nargs='+', default=["test"], help='list of split for testing')

    parser.add_argument('--version', required=False, type=int,   default=2,     help='Version of the dataset' )
    parser.add_argument('--feature_dim', required=False, type=int,   default=None,     help='Number of input features' )
    parser.add_argument('--evaluation_frequency', required=False, type=int,   default=10,     help='Number of chunks per epoch' )
    parser.add_argument('--framerate', required=False, type=int,   default=2,     help='Framerate of the input features' )
    parser.add_argument('--window_size', required=False, type=int,   default=16,     help='Size of the chunk (in seconds)' )
    parser.add_argument('--pool',       required=False, type=str,   default="NetVLAD++", help='How to pool' )
    parser.add_argument('--vocab_size',       required=False, type=int,   default=64, help='Size of the vocabulary for NetVLAD' )
    parser.add_argument('--NMS_window',       required=False, type=int,   default=30, help='NMS window in second' )
    parser.add_argument('--NMS_threshold',       required=False, type=float,   default=0.0, help='NMS threshold for positive results' )

    parser.add_argument('--batch_size', required=False, type=int,   default=256,     help='Batch size' )
    parser.add_argument('--LR',       required=False, type=float,   default=1e-03, help='Learning Rate' )
    parser.add_argument('--LRe',       required=False, type=float,   default=1e-06, help='Learning Rate end' )
    parser.add_argument('--patience', required=False, type=int,   default=30,     help='Patience before reducing LR (ReduceLROnPlateau)' )

    parser.add_argument('--GPU',        required=False, type=int,   default=-1,     help='ID of the GPU to use' )
    parser.add_argument('--max_num_worker',   required=False, type=int,   default=4, help='number of worker to load data')
    parser.add_argument('--seed',   required=False, type=int,   default=0, help='seed for reproducibility')

    # parser.add_argument('--logging_dir',       required=False, type=str,   default="log", help='Where to log' )
    parser.add_argument('--loglevel',   required=False, type=str,   default='INFO', help='logging level')

    args = parser.parse_args()

    # for reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    numeric_level = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % args.loglevel)

    os.makedirs(os.path.join("models", args.model_name), exist_ok=True)
    log_path = os.path.join("models", args.model_name,
                            datetime.now().strftime('%Y-%m-%d_%H-%M-%S.log'))
    logging.basicConfig(
        level=numeric_level,
        format=
        "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s",
        handlers=[
            logging.FileHandler(log_path),
            logging.StreamHandler()
        ])

    if args.GPU >= 0:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.GPU)


    start=time.time()
    logging.info('Starting main function')
    main(args)
    logging.info(f'Total Execution Time is {time.time()-start} seconds')

skfaysal · Feb 13 '23, 15:02

You are fine-tuning a pre-trained model with too large a learning rate, and your training diverged; hence the NaN loss.

If you change anything about the dataset, the output, the model, or the training, you will have to re-tune the hyperparameters. You cannot swap out part of the code and expect the same training parameters to still work.

In your particular case, try decreasing your learning rate.
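
For example, the same run can be launched with a smaller learning rate through the existing --LR flag (the value below is only illustrative):

python3 src/main.py --SoccerNet_path=/root/training_data/ --load_weights=NetVLAD++_run_4 --model_name=NetVLAD++_run_ --batch_size 256 --LR 1e-4

If the loss still occasionally diverges, a common extra safeguard is to clip gradients in the training loop. This is only a sketch, not part of the original train.py:

# in the training loop, after loss.backward() and before optimizer.step()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)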

SilvioGiancola · Feb 13 '23, 18:02