deep_q_rl
UnusedInputError
Hi,
Thanks for reading this post.
Currently, I am trying to create my own network for reinforcement learning. To this end, I have adapted the Q-network from "Playing Atari with Deep Reinforcement Learning" (Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller) and Mnih, Volodymyr, et al., "Human-level control through deep reinforcement learning," Nature 518.7540 (2015): 529-533.
When Theano tries to compile the functions for loss and q_vals,
self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens_train)
self._q_vals = theano.function([], q_vals, givens=givens_q_val)
it keeps returning UnusedInputError: theano.function was asked to create a function computing outputs given certain inputs, but the provided input variable at index 0 is not part of the computational graph needed to compute the outputs: <CudaNdarrayType(float32, 4D)>. To make this error into a warning, you can pass the parameter on_unused_input='warn' to theano.function. To disable it completely, use on_unused_input='ignore'.
I have been debugging the code many times, but I cannot understand why the inputs (from givens) are not used as part of the function/calculation.
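For what it's worth, here is a minimal, self-contained sketch (with made-up variables x and y, not from my network) of how I believe this error can be triggered: a variable that appears as a key in givens but never feeds the outputs leaves its shared replacement as an unused input at compile time.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.matrix('y')
out = T.sum(x ** 2)  # only x is part of the output graph

x_shared = theano.shared(np.zeros((2, 2), dtype=theano.config.floatX))
y_shared = theano.shared(np.zeros((2, 2), dtype=theano.config.floatX))

# y appears in givens but not in the graph of `out`, so compiling this
# should raise the same UnusedInputError, naming the unused replacement.
f = theano.function([], out, givens={x: x_shared, y: y_shared})

# on_unused_input='warn' (or 'ignore') would only silence the check:
# f = theano.function([], out, givens={x: x_shared, y: y_shared},
#                     on_unused_input='warn')

I understand that this toy case fails because the given variable never reaches the output, but in my code I do pass imgs/locs/hiss/sds into get_output, so I am not sure where the disconnect comes from.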
Many thanks in advance for your explanation.
Here is my full source code for the network:
""" import lasagne import numpy as np import theano import theano.tensor as T from updates import deepmind_rmsprop import logging
class DeepQLearner:
""" Deep Q-learning network using Lasagne. """
def __init__(self, width_img, height_img, width_loc, height_loc, width_his, height_his, target_dis_size, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=8.0):
self.width_img = width_img
self.height_img = height_img
self.width_loc = width_loc
self.height_loc = height_loc
self.width_his = width_his
self.height_his = height_his
self.target_dis_size = target_dis_size
self.num_actions = num_actions
self.num_frames = num_frames
self.batch_size = batch_size
self.discount = discount
self.rho = rho
self.lr = learning_rate
self.rms_epsilon = rms_epsilon
self.momentum = momentum
self.clip_delta = clip_delta
self.freeze_interval = freeze_interval
self.rng = rng
self.logger = logging.getLogger(__name__)
if not getattr(self.logger, 'handler_set', None):
self.logger.setLevel(logging.DEBUG)
# create a file handler
handler = logging.FileHandler('toy.log', mode='a')
handler.setLevel(logging.DEBUG)
# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# add the handlers to the logger
self.logger.addHandler(handler)
self.logger.handler_set = True
self.logger.info('initialise a Q network.')
lasagne.random.set_rng(self.rng)
self.update_counter = 0
self.l_out = self.build_network(network_type, num_actions, num_frames, batch_size)
if self.freeze_interval > 0:
self.next_l_out = self.build_network(network_type, num_actions,
num_frames, batch_size)
self.reset_q_hat()
#states = T.tensor4('states')
#next_states = T.tensor4('next_states')
imgs = T.tensor4('imgs')
next_imgs = T.tensor4('next_imgs')
locs = T.tensor4('locs')
next_locs = T.tensor4('next_locs')
hiss = T.tensor4('hiss')
next_hiss = T.tensor4('next_hiss')
target_distribution = T.tensor('target_distribution')
next_target_distribution = T.tensor('next_target_distribution')
sds = T.icol('sds')
next_sds = T.icol('next_sds')
rewards = T.col('rewards')
actions = T.icol('actions')
terminals = T.icol('terminals')
# input_height/input_width are not defined in this class and states/next_states
# are commented out above, so these shared variables are left out as well:
#self.states_shared = theano.shared(
#np.zeros((batch_size, num_frames, input_height, input_width),
#dtype=theano.config.floatX))
#self.next_states_shared = theano.shared(
#np.zeros((batch_size, num_frames, input_height, input_width),
#dtype=theano.config.floatX))
self.imgs_shared = theano.shared(
np.zeros((batch_size, num_frames, width_img, height_img),
dtype=theano.config.floatX))
self.next_imgs_shared = theano.shared(
np.zeros((batch_size, num_frames, width_img, height_img),
dtype=theano.config.floatX))
self.locs_shared = theano.shared(
np.zeros((batch_size, num_frames, width_loc, height_loc),
dtype=theano.config.floatX))
self.next_locs_shared = theano.shared(
np.zeros((batch_size, num_frames, width_loc, height_loc),
dtype=theano.config.floatX))
self.hiss_shared = theano.shared(
np.zeros((batch_size, num_frames, width_his, height_his),
dtype=theano.config.floatX))
self.next_hiss_shared = theano.shared(
np.zeros((batch_size, num_frames, width_his, height_his),
dtype=theano.config.floatX))
self.sds_shared = theano.shared(
np.zeros((batch_size, 1), dtype='int32'),
broadcastable=(False, True))
self.next_sds_shared = theano.shared(
np.zeros((batch_size, 1), dtype='int32'),
broadcastable=(False, True))
self.rewards_shared = theano.shared(
np.zeros((batch_size, 1), dtype=theano.config.floatX),
broadcastable=(False, True))
self.actions_shared = theano.shared(
np.zeros((batch_size, 1), dtype='int32'),
broadcastable=(False, True))
self.terminals_shared = theano.shared(
np.zeros((batch_size, 1), dtype='int32'),
broadcastable=(False, True))
# replaced below by the multi-input version; states is commented out above
#q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
# massage/ unpack states into the right form for multi input network
q_vals = lasagne.layers.get_output(self.l_out, {'l_in':imgs,
'l_loc1':locs, 'l_his':hiss,
'l_dis': sds})
if self.freeze_interval > 0:
# replaced below by the multi-input version; next_states is commented out above
#next_q_vals = lasagne.layers.get_output(self.next_l_out,
#next_states / input_scale)
next_q_vals = lasagne.layers.get_output(self.next_l_out,
{'l_in':next_imgs,
'l_loc1':next_locs, 'l_his':next_hiss,
'l_dis': next_sds})
else:
next_q_vals = lasagne.layers.get_output(self.l_out,
{'l_in':next_imgs,
'l_loc1':next_locs, 'l_his':next_hiss,
'l_dis': next_sds})
next_q_vals = theano.gradient.disconnected_grad(next_q_vals)
target = (rewards +
(T.ones_like(terminals) - terminals) *
self.discount * T.max(next_q_vals, axis=1, keepdims=True))
diff = target - q_vals[T.arange(batch_size),
actions.reshape((-1,))].reshape((-1, 1))
if self.clip_delta > 0:
# If we simply take the squared clipped diff as our loss,
# then the gradient will be zero whenever the diff exceeds
# the clip bounds. To avoid this, we extend the loss
# linearly past the clip point to keep the gradient constant
# in that regime.
#
# This is equivalent to declaring d loss/d q_vals to be
# equal to the clipped diff, then backpropagating from
# there, which is what the DeepMind implementation does.
quadratic_part = T.minimum(abs(diff), self.clip_delta)
linear_part = abs(diff) - quadratic_part
loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
else:
loss = 0.5 * diff ** 2
if batch_accumulator == 'sum':
loss = T.sum(loss)
elif batch_accumulator == 'mean':
loss = T.mean(loss)
else:
raise ValueError("Bad accumulator: {}".format(batch_accumulator))
params = lasagne.layers.helper.get_all_params(self.l_out)
givens_train = {
# states and next_states are commented out above
#states: self.states_shared,
#next_states: self.next_states_shared,
imgs: self.imgs_shared,
next_imgs: self.next_imgs_shared,
locs: self.locs_shared,
next_locs: self.next_locs_shared,
hiss: self.hiss_shared,
next_hiss: self.next_hiss_shared,
sds: self.sds_shared,
next_sds: self.next_sds_shared,
rewards: self.rewards_shared,
actions: self.actions_shared,
terminals: self.terminals_shared
}
givens_q_val = {
#states: self.states_shared,
#next_states: self.next_states_shared,
imgs: self.imgs_shared,
locs: self.locs_shared,
hiss: self.hiss_shared,
sds: self.sds_shared,
rewards: self.rewards_shared,
actions: self.actions_shared,
terminals: self.terminals_shared
}
if update_rule == 'deepmind_rmsprop':
updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
self.rms_epsilon)
elif update_rule == 'rmsprop':
updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
self.rms_epsilon)
elif update_rule == 'sgd':
updates = lasagne.updates.sgd(loss, params, self.lr)
else:
raise ValueError("Unrecognized update: {}".format(update_rule))
if self.momentum > 0:
updates = lasagne.updates.apply_momentum(updates, None,
self.momentum)
self._train = theano.function([], [loss, q_vals], updates=updates,
givens=givens_train)
self._q_vals = theano.function([], q_vals,
givens=givens_q_val)
def build_network(self, network_type, output_dim, num_frames, batch_size):
if network_type == "myOwn":
return self.build_myNetwork(output_dim, num_frames, batch_size)
else:
raise ValueError("Unrecognized network: {}".format(network_type))
def build_myNetwork(self, output_dim, num_frames, batch_size):
from lasagne.layers import dnn
l_in = lasagne.layers.InputLayer(
shape=(batch_size, num_frames, self.width_img, self.height_img)
)
l_conv1 = dnn.Conv2DDNNLayer(
l_in,
num_filters=32,
filter_size=(8, 8),
stride=(3, 3),
nonlinearity=lasagne.nonlinearities.rectify,
W=lasagne.init.HeUniform(),
b=lasagne.init.Constant(.1)
)
l_conv2 = dnn.Conv2DDNNLayer(
l_conv1,
num_filters=64,
filter_size=(4, 4),
stride=(1, 1),
nonlinearity=lasagne.nonlinearities.rectify,
W=lasagne.init.HeUniform(),
b=lasagne.init.Constant(.1)
)
l_conv3 = dnn.Conv2DDNNLayer(
l_conv2,
num_filters=64,
filter_size=(3, 3),
stride=(1, 1),
nonlinearity=lasagne.nonlinearities.rectify,
W=lasagne.init.HeUniform(),
b=lasagne.init.Constant(.1)
)
l_loc1 = lasagne.layers.InputLayer(
shape=(batch_size, num_frames, self.width_loc, self.height_loc)
)
n = 64
l_loc2 = lasagne.layers.DenseLayer(l_loc1, num_units=n)
#history = np.zeros((batch_size, num_frames, 4, 24*24), dtype=int)
l_his = lasagne.layers.InputLayer(
shape=(batch_size, num_frames, self.width_his, self.height_his)
)
l_his2 = lasagne.layers.DenseLayer(l_his, num_units=n)
l_dis = lasagne.layers.InputLayer(
shape=(batch_size, num_frames, self.target_dis_size)
)
l_dis2 = lasagne.layers.DenseLayer(l_dis, num_units=n)
l_conv4 = lasagne.layers.ReshapeLayer(l_conv3, (batch_size, 1, -1))
l_loc2 = lasagne.layers.ReshapeLayer(l_loc2, (batch_size,1,-1))
l_his2 = lasagne.layers.ReshapeLayer(l_his2, (batch_size,1,-1))
l_dis2 = lasagne.layers.ReshapeLayer(l_dis2, (batch_size,1,-1))
l_merge = lasagne.layers.ElemwiseSumLayer((l_conv4,l_loc2, l_his2, l_dis2 ))
print(l_conv4.output_shape)
print(l_loc2.output_shape)
print(l_his2.output_shape)
print(l_dis2.output_shape)
print(l_merge.output_shape)
l_hidden1 = lasagne.layers.DenseLayer(
l_merge,
num_units=320,
nonlinearity=lasagne.nonlinearities.rectify,
W=lasagne.init.HeUniform(),
b=lasagne.init.Constant(.1)
)
#
l_out = lasagne.layers.DenseLayer(
l_hidden1,
num_units=output_dim,
nonlinearity=None,
W=lasagne.init.HeUniform(),
b=lasagne.init.Constant(.1)
)
return l_out
def train(self, imgs ,next_imgs , locs, next_locs ,hiss,
next_hiss, sds ,
next_sds,
actions, rewards, terminals):
"""
Train one batch.
Arguments:
states - b x f x h x w numpy array, where b is batch size,
f is num frames, h is height and w is width.
actions - b x 1 numpy array of integers
rewards - b x 1 numpy array
next_states - b x f x h x w numpy array
terminals - b x 1 numpy boolean array (currently ignored)
Returns: average loss
"""
self.imgs_shared.set_value(imgs)
self.next_imgs_shared.set_value(next_imgs)
self.locs_shared.set_value(locs)
self.next_locs_shared.set_value(next_locs)
self.hiss_shared.set_value(hiss)
self.next_hiss_shared.set_value(next_hiss)
self.sds_shared.set_value(sds)
self.next_sds_shared.set_value(next_sds)
# states/next_states are no longer passed to train():
#self.states_shared.set_value(states)
#self.next_states_shared.set_value(next_states)
self.actions_shared.set_value(actions)
self.rewards_shared.set_value(rewards)
self.terminals_shared.set_value(terminals)
if (self.freeze_interval > 0 and
self.update_counter % self.freeze_interval == 0):
self.reset_q_hat()
loss, _ = self._train()
self.update_counter += 1
return np.sqrt(loss)
def q_vals(self, img , loc, his, sd):
# leftover single-input code (state, self.input_height and self.input_width
# are not defined in this class):
#states = np.zeros((self.batch_size, self.num_frames, self.input_height,
#self.input_width), dtype=theano.config.floatX)
#states[0, ...] = state
#self.states_shared.set_value(states)
imgs = np.zeros((self.batch_size, self.num_frames, self.height_img,
self.width_img), dtype=theano.config.floatX)
imgs[0, ...] = img
locs = np.zeros((self.batch_size, self.num_frames, self.height_loc,
self.width_loc), dtype=theano.config.floatX)
locs[0, ...] = loc
hiss = np.zeros((self.batch_size, self.num_frames, self.height_his,
self.width_his), dtype=theano.config.floatX)
hiss[0,...] = his
sds = np.zeros((self.batch_size, self.num_frames, self.target_dis_size),
dtype='int32')
sds[0, ...] = sd
self.imgs_shared.set_value(imgs)
self.locs_shared.set_value(locs)
self.hiss_shared.set_value(hiss)
self.sds_shared.set_value(sds)
return self._q_vals()[0]
def choose_action(self, img , loc, his, sd, epsilon):
if self.rng.rand() < epsilon:
return self.rng.randint(0, self.num_actions)
q_vals = self.q_vals(img , loc, his, sd)
return np.argmax(q_vals)
def reset_q_hat(self):
all_params = lasagne.layers.helper.get_all_param_values(self.l_out)
lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params)
def main():
#net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000,
#32, 'nature_cuda')
width_img = 24
height_img = 24
width_loc = 1
height_loc = 3
width_his = width_img *height_img
height_his = 4
target_dis_size = 1
num_actions = 9
num_frames = 1
discount = 0.99
learning_rate = .00025
rho = 0.95
rms_epsilon = 0.95
momentum = 0.95
clip_delta = 1
freeze_interval = 100
batch_size = 100
network_type = 'myOwn'
update_rule = 'deepmind_rmsprop'
batch_accumulator ='sum'
rng = np.random.RandomState(123456)
net = DeepQLearner(width_img,
height_img,
width_loc,
height_loc,
width_his,
height_his,
target_dis_size,
num_actions, num_frames, discount, learning_rate, rho,
rms_epsilon, momentum, clip_delta, freeze_interval,
batch_size, network_type, update_rule,
batch_accumulator, rng)
if __name__ == '__main__': main()