
An error about Q_learning

Open guest-oo opened this issue 4 months ago • 0 comments

a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
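
For context, Q[s, :] only works when s is a plain integer index into the table. My understanding (not verified against every version) is that in gym >= 0.26 and in gymnasium, env.reset() returns an (observation, info) tuple rather than a bare observation, so s ends up being a tuple and NumPy raises exactly this IndexError. A minimal sketch of that suspicion, using a hypothetical 16x4 FrozenLake-sized table:

import numpy as np

Q = np.zeros([16, 4])          # FrozenLake-v1: 16 states x 4 actions

s = 0                          # integer state index: works
print(Q[s, :])                 # -> [0. 0. 0. 0.]

s = (0, {'prob': 1.0})         # a tuple, like env.reset() returns in gym >= 0.26
print(Q[s, :])                 # raises IndexError: only integers, slices (:), ...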

import argparse
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--test', dest='test', action='store_true', default=True)
parser.add_argument(
    '--save_path', default=None, help='folder to save if mode == train else model path,'
    'qnet will be saved once target net update'
)
parser.add_argument('--seed', help='random seed', type=int, default=0)
parser.add_argument('--env_id', default='FrozenLake-v1')
args = parser.parse_args()

# Load the environment
alg_name = 'Qlearning'
env_id = args.env_id
env = gym.make(env_id)
render = False  # display the game environment

##================= Implement Q-Table learning algorithm =====================##

# Initialize table with all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])

# Set learning parameters
lr = .85  # alpha, if use value function approximation, we can ignore it
lambd = .99  # decay factor
num_episodes = 10000
t0 = time.time()

if args.train:
    all_episode_reward = []
    for i in range(num_episodes):
        ## Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        ## The Q-Table learning algorithm
        for j in range(99):
            if render: env.render()
            ## Choose an action by greedily (with noise) picking from Q table
            a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
            ## Get new state and reward from environment
            s1, r, d, _ = env.step(a)
            ## Update Q-Table with new knowledge
            Q[s, a] = Q[s, a] + lr * (r + lambd * np.max(Q[s1, :]) - Q[s, a])
            rAll += r
            s = s1
            if d is True:
                break
        print(
            'Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                i + 1, num_episodes, rAll, time.time() - t0
            )
        )
        if i == 0:
            all_episode_reward.append(rAll)
        else:
            all_episode_reward.append(all_episode_reward[-1] * 0.9 + rAll * 0.1)

    # save
    path = os.path.join('model', '_'.join([alg_name, env_id]))
    if not os.path.exists(path):
        os.makedirs(path)
    np.save(os.path.join(path, 'Q_table.npy'), Q)

    plt.plot(all_episode_reward)
    if not os.path.exists('image'):
        os.makedirs('image')
    plt.savefig(os.path.join('image', '_'.join([alg_name, env_id])))

# print("Final Q-Table Values:/n %s" % Q)

if args.test:
    path = os.path.join('model', '_'.join([alg_name, env_id]))
    Q = np.load(os.path.join(path, 'Q_table.npy'))
    for i in range(num_episodes):
        ## Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        ## The Q-Table learning algorithm
        for j in range(99):
            ## Choose an action by greedily (with noise) picking from Q table
            a = np.argmax(Q[s, :])
            ## Get new state and reward from environment
            s1, r, d, _ = env.step(a)
            ## Update Q-Table with new knowledge
            rAll += r
            s = s1
            if d is True:
                break
        print(
            'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                i + 1, num_episodes, rAll, time.time() - t0
            )
        )
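
The test loop would presumably need the same two changes; a short sketch assuming Q and env from the script above:

# Sketch only: greedy evaluation with the gym >= 0.26 reset/step signatures.
s, _ = env.reset()
rAll = 0
for j in range(99):
    a = np.argmax(Q[s, :])
    s, r, terminated, truncated, _ = env.step(a)
    rAll += r
    if terminated or truncated:
        break
print('episode reward:', rAll)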


guest-oo · Oct 19 '24