Has anyone encountered NaN loss?
Training works fine for small models like an MLP or LeNet-5, but with VGG16/ResNet18 the loss always becomes NaN. The model configuration and structure are below:

```python
import numpy as np
import torch.nn as nn

import BayesianLayer

cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

class VGG_CIFAR10_BAY(nn.Module):
    def __init__(self, vgg_name):
        super(VGG_CIFAR10_BAY, self).__init__()
        self.kl_list = []  # Bayesian layers whose KL terms are summed in model_kl_div
        self.features = self._make_layers(cfg[vgg_name])
        linear_index = BayesianLayer.LinearGroupNJ(512, 10, clip_var=0.04, cuda=True)
        self.classifier = linear_index
        self.kl_list.append(linear_index)
    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out
    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv_index = BayesianLayer.Conv2dGroupNJ(in_channels, x, kernel_size=3,
                                                         padding=1, clip_var=0.04, cuda=True)
                layers += [conv_index,
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                self.kl_list.append(conv_index)
                in_channels = x
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)
    def get_masks(self, thresholds):
        weight_masks = []
        mask = None
        layers = self.kl_list
        for i, (layer, threshold) in enumerate(zip(layers, thresholds)):
            # compute the dropout mask for this layer from its log dropout rates
            if len(layer.weight_mu.shape) > 2:
                # convolutional layer: mask has shape (out_channels, in_channels, 1, 1)
                if mask is None:
                    mask = [True] * layer.in_channels
                else:
                    mask = np.copy(next_mask)
                log_alpha = layers[i].get_log_dropout_rates().cpu().data.numpy()
                next_mask = log_alpha <= thresholds[i]
                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_mask = weight_mask[:, :, None, None]
            else:
                # fully connected layer
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                elif len(weight_mask.shape) > 2:
                    # previous layer was convolutional: tile its channel mask over the flattened features
                    temp = next_mask.repeat(layer.in_features // next_mask.shape[0])
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                    # mask = mask | temp  # upper bound for number of weights at the first fully connected layer
                    mask = mask & temp    # lower bound for number of weights at the fully connected layer
                else:
                    mask = np.copy(next_mask)
                try:
                    log_alpha = layers[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha <= thresholds[i + 1]
                except IndexError:
                    # must be the last layer, so keep all output units
                    next_mask = np.ones(10)
                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
            weight_masks.append(weight_mask.astype(float))
        return weight_masks
    def model_kl_div(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.layer_kl_div()
        return KLD
```
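For context, the loss I have in mind is the usual variational objective: cross-entropy plus the summed KL from `model_kl_div()`, put on a per-example scale. A minimal sketch of that training step is below; `model`, `optimizer`, `x`, `y`, and `N_train` are placeholder names, not part of the code above.

```python
# Minimal training-step sketch (assumed setup, not taken from the model code above).
import torch.nn.functional as F

def train_step(model, optimizer, x, y, N_train):
    optimizer.zero_grad()
    out = model(x)
    # discriminative term + KL term; dividing by N_train keeps the KL per-example
    loss = F.cross_entropy(out, y) + model.model_kl_div() / N_train
    loss.backward()
    optimizer.step()
    return loss.item()
```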
Is it caused by high variance? I have already tried clipping the variance, but that doesn't help...
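In case it helps with debugging, this is a sketch of how I try to localize the first non-finite value; the hook helper is my own, not part of the model above.

```python
import torch

def add_nan_hooks(model):
    """Print the first module whose forward output contains NaN or Inf."""
    def make_hook(name):
        def hook(module, inputs, output):
            if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
                print(f"non-finite output in {name} ({module.__class__.__name__})")
        return hook
    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

# torch.autograd.set_detect_anomaly(True) additionally reports the backward op
# that produced a NaN gradient, at the cost of slower training.
```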