SoccerNetv2-DevKit
SoccerNetv2-DevKit copied to clipboard
Training and validation loss are sometimes NaN during training.

More info?
The training script runs and calculates the loss correctly, but sometimes when the same script is run without any changes, the loss becomes "NaN."
Hi @skfaysal,
We would need a bit more information, such as which baseline (CALF, NetVLAD++, etc.) you are trying to run and which command line you are using. Thank you.
Hi @cioppaanthony ,
I'm running the NetVLAD++ baseline for action spotting, starting from loaded weights. The command I'm using to run the script is:
python3 src/main.py --SoccerNet_path=/root/training_data/ --load_weights=NetVLAD++_run_4 --model_name=NetVLAD++_run_ --batch_size 256
Here is the main script. I have fine-tuned the model by replacing the output layer for my reduced dataset, which has 4 output classes.
import os
import logging
from datetime import datetime
import time
import numpy as np
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import torch
import torch.nn as nn
import importlib
from dataset import SoccerNetClips, SoccerNetClipsTesting #,SoccerNetClipsOld
from model import Model
from train import trainer, test, testSpotting
from loss import NLLLoss
from inspect import currentframe, getframeinfo
def _trace_line():
    """Print the line number of the caller as a debug breadcrumb."""
    print("\n....[INFO] line {} called".format(currentframe().f_back.f_lineno))


def main(args):
    """Fine-tune a pre-trained checkpoint on a reduced label set, then run spotting evaluation.

    When ``args.test_only`` is false, the function builds the train/valid
    datasets, creates a model with the pre-trained head size, freezes every
    parameter, loads ``models/<args.load_weights>/model.pth.tar``, swaps
    ``model.fc`` for a fresh trainable layer and calls ``trainer``. In both
    modes it then reloads ``models/<args.model_name>/model.pth.tar`` and runs
    ``testSpotting`` on every split in ``args.split_test``, logging the
    a_mAP metrics of each split that returns results.

    Args:
        args: Parsed command-line namespace. Fields read here:
            ``SoccerNet_path``, ``features``, ``split_train``,
            ``split_valid``, ``split_test``, ``version``, ``framerate``,
            ``window_size``, ``feature_dim``, ``vocab_size``, ``pool``,
            ``test_only``, ``load_weights``, ``model_name``, ``batch_size``,
            ``max_num_worker``, ``LR``, ``patience``, ``max_epochs``,
            ``evaluation_frequency``, ``NMS_window``, ``NMS_threshold``.
            ``args.feature_dim`` is filled in from the test dataset when it
            is ``None``.

    Raises:
        ValueError: If training is requested without ``--load_weights``.
    """
    # Number of classes the pre-trained checkpoint was built with; the model
    # must be created with this value so the checkpoint's tensors fit.
    pretrained_num_classes = 17
    # Width of the replacement output layer.
    # NOTE(review): presumably the 4 classes of the reduced dataset plus one
    # extra (background) output -- confirm against Model's own head.
    finetune_outputs = 5

    logging.info("Parameters:")
    for arg in vars(args):
        logging.info(arg.rjust(15) + " : " + str(getattr(args, arg)))

    # Fail fast: without this check the run only dies after the datasets and
    # the model have been built, with an opaque TypeError from os.path.join.
    if not args.test_only and args.load_weights is None:
        raise ValueError(
            "--load_weights is required when training: this script "
            "fine-tunes from models/<load_weights>/model.pth.tar")

    # Arguments shared by every dataset constructed below.
    clip_kwargs = dict(path=args.SoccerNet_path, features=args.features,
                       version=args.version, framerate=args.framerate,
                       window_size=args.window_size)

    # create dataset
    if not args.test_only:
        _trace_line()
        dataset_Train = SoccerNetClips(split=args.split_train, **clip_kwargs)
        print('\n')
        dataset_Valid = SoccerNetClips(split=args.split_valid, **clip_kwargs)
        print('\n')
        dataset_Valid_metric = SoccerNetClips(split=args.split_valid, **clip_kwargs)
    _trace_line()
    # The test dataset is needed in both modes (feature_dim / num_classes).
    dataset_Test = SoccerNetClipsTesting(split=args.split_test, **clip_kwargs)

    if args.feature_dim is None:
        _trace_line()
        # Infer the feature dimension from the last axis of the first sample.
        args.feature_dim = dataset_Test[0][1].shape[-1]

    # create model
    _trace_line()
    if args.test_only:
        num_classes = dataset_Test.num_classes
    else:
        num_classes = pretrained_num_classes
    model = Model(input_size=args.feature_dim,
                  num_classes=num_classes, window_size=args.window_size,
                  vocab_size=args.vocab_size,
                  framerate=args.framerate, pool=args.pool).cuda()

    if not args.test_only:
        # Freeze everything that exists now; only the layer created after
        # this point (the new fc) will have requires_grad=True.
        for param in model.parameters():
            param.requires_grad = False

        checkpoint = torch.load(os.path.join("models", args.load_weights, "model.pth.tar"))
        model.load_state_dict(checkpoint['state_dict'])
        print("\n\n=> loaded checkpoint '{}' (epoch {})"
              .format(args.load_weights, checkpoint['epoch']))

    logging.info(model)
    print("\n[info] Model Parameters before changing fc: ", model.parameters)

    # NOTE(review): the head is replaced in test-only mode as well, so that the
    # fine-tuned checkpoint loaded further down matches -- the pasted source
    # had lost its indentation, confirm this is the intended nesting.
    print("\n\n######## After change fc layer:")
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, finetune_outputs).cuda()
    print("\n[info] Model Parameters: ", model.parameters)

    # create dataloader
    if not args.test_only:
        _trace_line()
        train_loader = torch.utils.data.DataLoader(dataset_Train,
            batch_size=args.batch_size, shuffle=True,
            num_workers=args.max_num_worker, pin_memory=True)

        val_loader = torch.utils.data.DataLoader(dataset_Valid,
            batch_size=args.batch_size, shuffle=False,
            num_workers=args.max_num_worker, pin_memory=True)

        val_metric_loader = torch.utils.data.DataLoader(dataset_Valid_metric,
            batch_size=args.batch_size, shuffle=False,
            num_workers=args.max_num_worker, pin_memory=True)

        # Sanity check: show the shapes of one training batch.
        train_features, train_labels = next(iter(train_loader))
        print(f"\n\nFeature batch shape: {train_features.size()}")
        print(f"\n\nLabels batch shape: {train_labels.size()}")

    # training parameters
    if not args.test_only:
        _trace_line()
        criterion = NLLLoss()

        print("\n\n Parameters to update: ")
        trainable_params = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                print("\t", name)
                trainable_params.append(param)

        # Only hand the optimizer what is actually trained; the frozen
        # backbone never receives gradients.
        optimizer = torch.optim.Adam(trainable_params, lr=args.LR,
                                     betas=(0.9, 0.999), eps=1e-08,
                                     weight_decay=0, amsgrad=False)

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=args.patience)

        # start training
        _trace_line()
        trainer(train_loader, val_loader, val_metric_loader,
                model, optimizer, scheduler, criterion,
                model_name=args.model_name,
                max_epochs=args.max_epochs, evaluation_frequency=args.evaluation_frequency)

    # Drop references to the training data before evaluation.
    if not args.test_only:
        del dataset_Train, dataset_Valid, dataset_Valid_metric, dataset_Test
        del train_loader, val_loader, val_metric_loader

    # For the best model only
    checkpoint = torch.load(os.path.join("models", args.model_name, "model.pth.tar"))
    model.load_state_dict(checkpoint['state_dict'])

    # Evaluate each requested split separately.
    for split in args.split_test:
        _trace_line()
        dataset_Test = SoccerNetClipsTesting(split=[split], **clip_kwargs)

        test_loader = torch.utils.data.DataLoader(dataset_Test,
            batch_size=1, shuffle=False,
            num_workers=1, pin_memory=True)

        results = testSpotting(test_loader, model=model, model_name=args.model_name, NMS_window=args.NMS_window, NMS_threshold=args.NMS_threshold)
        if results is None:
            # testSpotting produced no metrics for this split; skip logging.
            continue

        a_mAP = results["a_mAP"]
        a_mAP_per_class = results["a_mAP_per_class"]
        a_mAP_visible = results["a_mAP_visible"]
        a_mAP_per_class_visible = results["a_mAP_per_class_visible"]
        a_mAP_unshown = results["a_mAP_unshown"]
        a_mAP_per_class_unshown = results["a_mAP_per_class_unshown"]

        logging.info("Best Performance at end of training ")
        logging.info("a_mAP visibility all: " + str(a_mAP))
        logging.info("a_mAP visibility all per class: " + str(a_mAP_per_class))
        logging.info("a_mAP visibility visible: " + str(a_mAP_visible))
        logging.info("a_mAP visibility visible per class: " + str(a_mAP_per_class_visible))
        logging.info("a_mAP visibility unshown: " + str(a_mAP_unshown))
        logging.info("a_mAP visibility unshown per class: " + str(a_mAP_per_class_unshown))
if __name__ == '__main__':
    # Script entry point: parse the CLI, seed the RNGs, set up file + console
    # logging under models/<model_name>/, optionally pin a GPU, then run main().
    parser = ArgumentParser(
        description='context aware loss function',
        formatter_class=ArgumentDefaultsHelpFormatter,
    )

    # --- data / model selection ---
    parser.add_argument('--SoccerNet_path', required=False, type=str,
                        default="/path/to/SoccerNet/", help='Path for SoccerNet')
    parser.add_argument('--features', required=False, type=str,
                        default="ResNET_TF2.npy", help='Video features')
    parser.add_argument('--max_epochs', required=False, type=int,
                        default=1000, help='Maximum number of epochs')
    parser.add_argument('--load_weights', required=False, type=str,
                        default=None, help='weights to load')
    parser.add_argument('--model_name', required=False, type=str,
                        default="NetVLAD++", help='named of the model to save')
    parser.add_argument('--test_only', required=False, action='store_true',
                        help='Perform testing only')

    # --- dataset splits ---
    parser.add_argument('--split_train', nargs='+', default=["train"],
                        help='list of split for training')
    parser.add_argument('--split_valid', nargs='+', default=["valid"],
                        help='list of split for validation')
    parser.add_argument('--split_test', nargs='+', default=["test"],
                        help='list of split for testing')

    # --- feature / architecture settings ---
    parser.add_argument('--version', required=False, type=int,
                        default=2, help='Version of the dataset')
    parser.add_argument('--feature_dim', required=False, type=int,
                        default=None, help='Number of input features')
    parser.add_argument('--evaluation_frequency', required=False, type=int,
                        default=10, help='Number of chunks per epoch')
    parser.add_argument('--framerate', required=False, type=int,
                        default=2, help='Framerate of the input features')
    parser.add_argument('--window_size', required=False, type=int,
                        default=16, help='Size of the chunk (in seconds)')
    parser.add_argument('--pool', required=False, type=str,
                        default="NetVLAD++", help='How to pool')
    parser.add_argument('--vocab_size', required=False, type=int,
                        default=64, help='Size of the vocabulary for NetVLAD')
    parser.add_argument('--NMS_window', required=False, type=int,
                        default=30, help='NMS window in second')
    parser.add_argument('--NMS_threshold', required=False, type=float,
                        default=0.0, help='NMS threshold for positive results')

    # --- optimisation / runtime settings ---
    parser.add_argument('--batch_size', required=False, type=int,
                        default=256, help='Batch size')
    parser.add_argument('--LR', required=False, type=float,
                        default=1e-03, help='Learning Rate')
    parser.add_argument('--LRe', required=False, type=float,
                        default=1e-06, help='Learning Rate end')
    parser.add_argument('--patience', required=False, type=int,
                        default=30, help='Patience before reducing LR (ReduceLROnPlateau)')
    parser.add_argument('--GPU', required=False, type=int,
                        default=-1, help='ID of the GPU to use')
    parser.add_argument('--max_num_worker', required=False, type=int,
                        default=4, help='number of worker to load data')
    parser.add_argument('--seed', required=False, type=int,
                        default=0, help='seed for reproducibility')
    # (a --logging_dir option is intentionally not exposed; logs go next to the model)
    parser.add_argument('--loglevel', required=False, type=str,
                        default='INFO', help='logging level')

    args = parser.parse_args()

    # Seed both torch and numpy so runs are repeatable.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Translate the textual level (e.g. "info") into logging's integer constant.
    level_value = getattr(logging, args.loglevel.upper(), None)
    if not isinstance(level_value, int):
        raise ValueError(f'Invalid log level: {args.loglevel}')

    # One timestamped log file per run, stored beside the model checkpoint.
    run_dir = os.path.join("models", args.model_name)
    os.makedirs(run_dir, exist_ok=True)
    log_file = os.path.join(run_dir, datetime.now().strftime('%Y-%m-%d_%H-%M-%S.log'))
    logging.basicConfig(
        level=level_value,
        format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s",
        handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
    )

    # A non-negative --GPU restricts the process to that single device.
    if args.GPU >= 0:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.GPU)

    t0 = time.time()
    logging.info('Starting main function')
    main(args)
    elapsed = time.time() - t0
    logging.info(f'Total Execution Time is {elapsed} seconds')
You are fine-tuning a pre-trained model with a large learning rate and your training diverged, hence the NaN loss.
If you make any change to the dataset, the output, the model, or the training, you will have to perform hyperparameter tuning. You can't replace part of the code and expect to use the same training parameters.
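In your particular case, try decreasing your learning rate: your command does not set `--LR`, so the script's default of 1e-03 is used. Try a smaller value, e.g. `--LR 1e-04`.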