mxnet-the-straight-dope
mxnet-the-straight-dope copied to clipboard
Gradient has not been updated by backward since last step
@ck37 @jergason @rravu3 @sebg @mbrookes I have the problem. How can I solve it?
UserWarning: Gradient of Parameter multimodel0_embedding0_weight on context cpu(0) has not been updated by backward since last step. This could mean a bug in your model that maked it only use a subset of the Parameters (Blocks) for this iteration. If you are intentionally only using a subset, call step with ignore_stale_grad=True to suppress this warning and skip updating of Parameters with stale gradient
In which notebook?
@kevinthesun It's not in these notebooks. I just encounter the warning when I build a neural network. It happens when I use the embedding layer in a for-loop according to my data.
My code is here:
def __init__(self, mode, vocab_size, num_embed, num_hidden, num_layers, dropout=0.5, tie_weights=False, **kwargs): super(Multimodel, self).__init__(**kwargs) with self.name_scope(): self.net = gluon.nn.Sequential() self.net.add(nn.Dropout(dropout), nn.Embedding(vocab_size, num_embed, weight_initializer=mx.init.Uniform(0.1)), rnn.LSTM(num_hidden, num_layers, dropout=dropout, input_size=num_embed) ) self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, input_size=num_embed) self.dense = gluon.nn.Dense(Config.args_Num_class, activation="relu") self.num_hidden = num_hidden
news_sentence = ndarray.empty([sen_num, batch_size, self.num_hidden], ctx=context) for index, _ in enumerate(news_text): out_sen_rnn = self.net(news_text[index]) news_sentence[index] = out_sen_rnn[sen_len-1] news_sentence = news_sentence.transpose([1, 0, 2]) out_sen_rnn = self.rnn(news_sentence) output = self.dense(out_sen_rnn)
UserWarning: Gradient of Parameter
multimodel0_embedding0_weighton context cpu(0) has not been updated by backward since last
step. This could mean a bug in your model that maked it only use a subset of the Parameters (Blocks) for this iteration. If you are intentionally only using a subset, call step with ignore_stale_grad=True to suppress this warning and skip updating of Parameters with stale gradient
@Gyaya Usually this issue comes from training. Can you post your training code? After you do backward for a network, you want to make sure all the gradients computed by backward are used to update network parameters. If you don't want to update some parts of your network, you should detach the output: https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html?highlight=detach#mxnet.ndarray.NDArray.detach so that gradients of this part of network are not computed.
@kevinthesun
Last waring has been solved. But the loss and accuracy in each epoch don't change. I don't know why this happned.
Here is my network defination
`import mxnet as mx from mxnet import gluon, autograd from mxnet.gluon import nn, rnn from mxnet import ndarray from PIL import Image import os import numpy as np from pprint import pprint import sys import data_helper
class Config(): args_data = './data/nlp/ptb.' args_imgpath = "./data/images" args_model = 'lstm' args_emsize = 128 args_nhid = 128 args_nlayers = 2 args_lr = 0.1 args_clip = 0.2 args_epochs = 20 args_batch_size = 10 args_bptt = 10 args_dropout = 0.2 args_tied = True args_cuda = 'store_true' args_log_interval = 500 args_save = 'model_2lyer.param' args_vocab_size = 96806 args_senLen_keepProb = 0.6 args_senNum_keepProb = 0.6 args_imgNum_keepProb = 0.6 args_Num_class = 3
class Sentence_rnn(nn.Block): def init(self, mode, vocab_size, num_embed, num_hidden, num_layers, dropout, model=None, **kwargs): super(Sentence_rnn, self).init(**kwargs) with self.name_scope(): self.drop = nn.Dropout(dropout) if model is not None: self.encoder = nn.Embedding(vocab_size, num_embed, params=model.encoder.collect_params()) self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, input_size=num_embed, params=model.rnn.collect_params()) else: self.encoder = nn.Embedding(vocab_size, num_embed, weight_initializer=mx.init.Uniform(0.1)) self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout, input_size=num_embed) self.num_hidden = num_hidden
def forward(self, input, text_struct, context, *args):
#embedding = self.drop(self.encoder(input))
embedding = self.encoder(input)
output = self.rnn(embedding)
output = data_helper.getTail(output, text_struct, context)
return output
class Multimodel(gluon.Block): """A model with an encoder, recurrent layer, and a decoder."""
def __init__(self, mode, vocab_size, num_embed, num_hidden,
num_layers, sen_num, image_num, dropout=0.5,**kwargs):
super(Multimodel, self).__init__(**kwargs)
with self.name_scope():
self.net_rnn = gluon.nn.Sequential()
with self.net_rnn.name_scope():
self.net_rnn_0 = Sentence_rnn(mode, vocab_size, num_embed, num_hidden,
num_layers, dropout)
self.net_rnn.add(self.net_rnn_0)
for i in range(1, sen_num):
self.net_rnn.add(Sentence_rnn(mode, vocab_size, num_embed, num_hidden,
num_layers, dropout, model=self.net_rnn_0))
self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout,
input_size=num_embed)
self.dense = gluon.nn.Dense(Config.args_Num_class, activation="relu")
def _normalizeText(self, text, sen_len, sen_num):
#sen_len * sen_num
struct = []
struct_text = []
for key in text.keys():
sentence = text[key]
if len(sentence)<sen_len:
struct.append(len(sentence))
sentence.extend([0 for i in range(sen_len-len(sentence))])
else:
sentence = sentence[0:sen_len]
struct.append(sen_len)
struct_text.append(sentence)
if (len(struct_text)) < sen_num:
struct_text.extend([[0 for j in range(sen_len)] for i in range(sen_num-len(struct_text))])
struct.extend([0 for _ in range(sen_num - len(struct))])
else:
struct_text = struct_text[0:sen_num]
struct = struct[0:sen_num]
return struct_text, struct
#(data_text, data_image, news_id_batches[i])
def forward(self, data_text, data_image, news_ids, sen_len, sen_num, image_num, hidden, context):
news_struct = []
news_text = []
# 数据预处理
batch_size = len(news_ids)
for news_id in news_ids:
# print(news_id)
text, struct = self._normalizeText(data_text[news_id], sen_len, sen_num)
news_struct.append(struct)
news_text.append(text)
news_text = mx.nd.array(news_text, ctx=context).transpose([1, 2, 0])
news_struct = np.array(news_struct).transpose([1, 0])
# 新闻文本处理
news_sentence = ndarray.zeros([1, batch_size, Config.args_nhid])
# news_sentence = ndarray.empty([sen_num, batch_size, Config.args_nhid], ctx=context)
for i, b in enumerate(self.net_rnn):
unit_out = b(news_text[i], news_struct[i], context).reshape([1, batch_size, Config.args_nhid])
news_sentence = ndarray.concat(news_sentence, unit_out, dim=0)
news_sentence = news_sentence[1:news_sentence.shape[0]-1]
# print("news_sentence", news_sentence.shape)
output, hidden = self.rnn(news_sentence, hidden)
output = output.transpose([1, 0, 2])
output = self.dense(output)
return output, hidden
def begin_state(self, *args, **kwargs):
return self.rnn.begin_state(*args, **kwargs)
}
Here is my train Function `def detach(hidden): if isinstance(hidden, (tuple, list)): hidden = [i.detach() for i in hidden] else: hidden = hidden.detach() return hidden
def eval(data_source): total_L = 0.0 ntotal = 0 train_acc = 0.0 i = 0 hidden = model.begin_state(func = mx.nd.zeros, batch_size = config.args_batch_size, ctx=context) for (data_text, data_image, target) in get_batch(data_source): output, hidden = model(data_text, data_image, data_source[i], sen_len, sen_num, image_num, hidden, context) L = loss(output, target) total_L += mx.nd.sum(L).asscalar() ntotal += L.size train_acc += accuracy(output, target) i = i + 1 return total_L / ntotal, train_acc/data_source.shape[0]
def get_batch(news_id_batches): for index in range(news_id_batches.shape[0]): targets = [] data_image = {} data_text = {} for id in news_id_batches[index]: targets.append(labels[id]) data_image[id] = x_image[id] data_text[id] = x_text[id] yield data_text, data_image, mx.ndarray.array(targets, ctx=context)
def accuracy(output, label): return ndarray.mean(output.argmax(axis=1) == label).asscalar()
def train(): best_val = float("Inf") for epoch in range(config.args_epochs): train_acc = 0.0 train_loss = 0.0 start_time = time.time() hidden = model.begin_state(func = mx.nd.zeros, batch_size = config.args_batch_size, ctx = context) for i, (data_text, data_image, target) in enumerate(get_batch(train_news_id_batches)): hidden = detach(hidden) with autograd.record(): output, hidden = model(data_text, data_image, news_id_batches[i], sen_len, sen_num, image_num, hidden, context) L = loss(output, target) L.backward()
trainer.step(config.args_batch_size)
train_loss += mx.nd.mean(L).asscalar()
train_acc += accuracy(output, target)
if i % config.args_bptt == 0 and i > 0:
cur_L = train_loss / i
cur_ACC = train_acc / i
print('[Epoch %d Batch %d] loss %.2f, accuracy %.2f' % (
epoch + 1, i, cur_L, cur_ACC))
val_loss, val_acc = eval(val_news_id_batches)
print('[Epoch %d] time cost %.2fs, validation loss %.2f, validation accuracy %.2f' % (
epoch + 1, time.time() - start_time, val_loss, val_acc))
if val_loss < best_val:
best_val = val_loss
test_loss, test_acc = eval(test_news_id_batches)
model.save_params(config.args_save)
print('test loss %.2f, test accuracy %.2f' % (test_loss, test_acc))
else:
args_lr = config.args_lr * 0.25
trainer._init_optimizer('sgd',
{'learning_rate': args_lr,
'momentum': 0,
'wd': 0})
# model.load_params(config.args_save, context)
train() model.load_params(config.args_save, context)
test_loss, test_acc = eval(test_news_id_batches) print('Best test loss %.2f, test accuracy %.2f'%(test_loss, test_acc))`
@kevinthesun - what's the status of this issue? Should I close?
@Gyaya https://github.com/kevinthesun/mxnet/blob/GluonDebugTutorial/docs/tutorials/gluon/gluon-debug.ipynb Take a look at this debug tutorial
I have met the same issue for time series, and the common part between my code and this code is the reshape() in forward(). And got the error at the point trainer.step(batch_size).
hello ,author, i meet the same question.could you tell me how to solve the warning
I have met the same issue for time series, and the common part between my code and this code is the reshape() in forward(). And got the error at the point trainer.step(batch_size). could you tell me how to solve the warning