Error when training basicvsr x2 model after 300k iterations
I modified basicvsr_arch.py to train an x2 model. Training worked fine for the first 300k iterations, but an error occurs once training passes 300k iterations. I tried to resume the training, but the same error happened again. It seems to be an error related to setting the learning rate.
basicsr/models/lr_scheduler.py", line 88, in get_lr current_weight = self.restart_weights[idx] TypeError: list indices must be integers or slices, not NoneType
Full error log from terminal
Modification of class BasicVSR in basicvsr_arch.py
`
class BasicVSR(nn.Module):
    """BasicVSR variant with a single x2 upsampling stage.

    Features are propagated bidirectionally over the frame sequence: a
    backward pass (last frame to first) followed by a forward pass (first
    frame to last). For every frame the two propagated features are fused,
    upsampled once with ``PixelShuffle(2)`` and added to a bilinearly
    upsampled (x2) copy of the input frame.

    Args:
        num_feat (int): Number of feature channels used by the propagation
            trunks and the reconstruction layers. Default: 64.
        num_block (int): Block count passed to each ``ConvResidualBlocks``
            trunk. Default: 15.
        spynet_path (str | None): Passed through to ``SpyNet``; presumably
            the path of pretrained SpyNet weights (confirm against SpyNet).
            Default: None.
    """

    def __init__(self, num_feat=64, num_block=15, spynet_path=None):
        super().__init__()
        self.num_feat = num_feat

        # alignment
        self.spynet = SpyNet(spynet_path)

        # propagation: each trunk consumes the 3-channel frame concatenated
        # with the propagated feature, hence ``num_feat + 3`` input channels.
        self.backward_trunk = ConvResidualBlocks(num_feat + 3, num_feat, num_block)
        self.forward_trunk = ConvResidualBlocks(num_feat + 3, num_feat, num_block)

        # reconstruction
        self.fusion = nn.Conv2d(num_feat * 2, num_feat, 1, 1, 0, bias=True)
        # A single upsampling stage (x2). The x4 model has a second
        # ``upconv2`` stage, which is intentionally absent here.
        self.upconv1 = nn.Conv2d(num_feat, num_feat * 4, 3, 1, 1, bias=True)
        # After PixelShuffle(2) the tensor has ``num_feat`` channels, so the
        # following layers must be sized by ``num_feat`` (a hard-coded 64
        # only works for the default setting).
        self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_last = nn.Conv2d(num_feat, 3, 3, 1, 1)
        self.pixel_shuffle = nn.PixelShuffle(2)

        # activation functions
        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def get_flow(self, x):
        """Estimate optical flow between every pair of neighbouring frames.

        Args:
            x (Tensor): Input sequence of shape (b, n, c, h, w).

        Returns:
            tuple[Tensor, Tensor]: ``(flows_forward, flows_backward)``, each
            reshaped to (b, n - 1, 2, h, w).
        """
        b, n, c, h, w = x.size()

        # Frames 0..n-2 and frames 1..n-1, flattened into the batch dimension
        # so that SpyNet processes all neighbouring pairs in one call.
        x_1 = x[:, :-1, :, :, :].reshape(-1, c, h, w)
        x_2 = x[:, 1:, :, :, :].reshape(-1, c, h, w)

        flows_backward = self.spynet(x_1, x_2).view(b, n - 1, 2, h, w)
        flows_forward = self.spynet(x_2, x_1).view(b, n - 1, 2, h, w)

        return flows_forward, flows_backward

    def forward(self, x):
        """Super-resolve a frame sequence by a factor of 2.

        Args:
            x (Tensor): Input sequence of shape (b, n, c, h, w). The trunks
                are built for ``num_feat + 3`` input channels, so ``c`` must
                be 3.

        Returns:
            Tensor: Output sequence of shape (b, n, 3, 2 * h, 2 * w).
        """
        flows_forward, flows_backward = self.get_flow(x)
        b, n, _, h, w = x.size()

        # backward branch: walk from the last frame to the first, storing
        # the propagated feature of every frame in ``out_l``.
        out_l = []
        feat_prop = x.new_zeros(b, self.num_feat, h, w)
        for i in range(n - 1, -1, -1):
            x_i = x[:, i, :, :, :]
            if i < n - 1:
                # Align the feature carried over from frame i + 1 to frame i.
                flow = flows_backward[:, i, :, :, :]
                feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1))
            feat_prop = torch.cat([x_i, feat_prop], dim=1)
            feat_prop = self.backward_trunk(feat_prop)
            out_l.insert(0, feat_prop)

        # forward branch: walk from the first frame to the last and
        # reconstruct each output frame as soon as its feature is available.
        feat_prop = torch.zeros_like(feat_prop)
        for i in range(0, n):
            x_i = x[:, i, :, :, :]
            if i > 0:
                # Align the feature carried over from frame i - 1 to frame i.
                flow = flows_forward[:, i - 1, :, :, :]
                feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1))
            feat_prop = torch.cat([x_i, feat_prop], dim=1)
            feat_prop = self.forward_trunk(feat_prop)

            # upsample: fuse backward and forward features, then apply the
            # single x2 pixel-shuffle stage.
            out = torch.cat([out_l[i], feat_prop], dim=1)
            out = self.lrelu(self.fusion(out))
            out = self.lrelu(self.pixel_shuffle(self.upconv1(out)))
            out = self.lrelu(self.conv_hr(out))
            out = self.conv_last(out)

            # Residual connection to the bilinearly upsampled input; the
            # scale factor must match the x2 upsampling above.
            base = F.interpolate(x_i, scale_factor=2, mode='bilinear', align_corners=False)
            out += base
            # Reuse the slot that held the backward feature for the result.
            out_l[i] = out

        return torch.stack(out_l, dim=1)
`
Thanks for reporting this bug.
It may be a bug in the scheduler: when the iteration you resume from equals one of the periods in the scheduler, it returns None. I will look deeper into this bug later.
For a quick fix, you may try to resume from 295000.state instead of 300000.state.
This bug still exists.🥲
This bug seems to still exist today; looking forward to a fix😣