Has anyone encountered NaN loss?
Training works fine for small models like an MLP or LeNet-5, but with VGG16/ResNet18 the loss always becomes NaN. The model configuration and structure are below:

```python
import numpy as np
import torch.nn as nn

import BayesianLayer

cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

class VGG_CIFAR10_BAY(nn.Module):
    def __init__(self, vgg_name):
        super(VGG_CIFAR10_BAY, self).__init__()
        self.kl_list = []  # Bayesian layers whose KL terms are summed in model_kl_div
        self.features = self._make_layers(cfg[vgg_name])
        linear_index = BayesianLayer.LinearGroupNJ(512, 10, clip_var=0.04, cuda=True)
        self.classifier = linear_index
        self.kl_list.append(linear_index)
    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out
    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv_index = BayesianLayer.Conv2dGroupNJ(in_channels, x, kernel_size=3,
                                                         padding=1, clip_var=0.04, cuda=True)
                layers += [conv_index,
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                self.kl_list.append(conv_index)
                in_channels = x
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)
    def get_masks(self, thresholds):
        weight_masks = []
        mask = None
        layers = self.kl_list
        for i, (layer, threshold) in enumerate(zip(layers, thresholds)):
            # compute the dropout mask for this layer from its log dropout rates
            if len(layer.weight_mu.shape) > 2:
                # convolutional layer: mask has shape (out_channels, in_channels, 1, 1)
                if mask is None:
                    mask = [True] * layer.in_channels
                else:
                    mask = np.copy(next_mask)
                log_alpha = layers[i].get_log_dropout_rates().cpu().data.numpy()
                next_mask = log_alpha <= thresholds[i]
                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_mask = weight_mask[:, :, None, None]
            else:
                # fully connected layer
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                elif len(weight_mask.shape) > 2:
                    # previous layer was convolutional: tile its channel mask over the flattened features
                    temp = next_mask.repeat(layer.in_features // next_mask.shape[0])
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                    # mask = mask | temp  # upper bound for number of weights at the first fully connected layer
                    mask = mask & temp    # lower bound for number of weights at the fully connected layer
                else:
                    mask = np.copy(next_mask)
                try:
                    log_alpha = layers[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha <= thresholds[i + 1]
                except IndexError:
                    # must be the last layer, so keep all output units
                    next_mask = np.ones(10)
                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
            weight_masks.append(weight_mask.astype(float))
        return weight_masks
    def model_kl_div(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.layer_kl_div()
        return KLD
```
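For context, the loss I have in mind is the usual variational objective: cross-entropy plus the summed KL from `model_kl_div()`, put on a per-example scale. A minimal sketch of that training step is below; `model`, `optimizer`, `x`, `y`, and `N_train` are placeholder names, not part of the code above.

```python
# Minimal training-step sketch (assumed setup, not taken from the model code above).
import torch.nn.functional as F

def train_step(model, optimizer, x, y, N_train):
    optimizer.zero_grad()
    out = model(x)
    # discriminative term + KL term; dividing by N_train keeps the KL per-example
    loss = F.cross_entropy(out, y) + model.model_kl_div() / N_train
    loss.backward()
    optimizer.step()
    return loss.item()
```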
Is it caused by high variance? I have already tried clipping the variance, but that doesn't help...
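In case it helps with debugging, this is a sketch of how I try to localize the first non-finite value; the hook helper is my own, not part of the model above.

```python
import torch

def add_nan_hooks(model):
    """Print the first module whose forward output contains NaN or Inf."""
    def make_hook(name):
        def hook(module, inputs, output):
            if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
                print(f"non-finite output in {name} ({module.__class__.__name__})")
        return hook
    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

# torch.autograd.set_detect_anomaly(True) additionally reports the backward op
# that produced a NaN gradient, at the cost of slower training.
```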